- Linaro-mm-sig - lists.linaro.org

Re: [RFC PATCH 08/12] vfio/pci: Create host unaccessible dma-buf for private device

by Jason Gunthorpe

On Tue, Jan 07, 2025 at 10:27:15PM +0800, Xu Yilun wrote: > Add a flag for ioctl(VFIO_DEVICE_BIND_IOMMUFD) to mark a device as > for private assignment. For these private assigned devices, disallow > host accessing their MMIO resources. Why? Shouldn't the VMM simply not call mmap? Why does the kernel have to enforce this? Jason

5 months

1
0
0 0

[PATCH 6.12 080/114] drm/xe: Move the coredump registration to the worker thread

by Greg Kroah-Hartman

6.12-stable review patch. If anyone has any objections, please let me know. ------------------ From: John Harrison <John.C.Harrison(a)Intel.com> [ Upstream commit 5dce85fecb87751ec94526e1ac516dd7871e2e0c ] Adding lockdep checking to the coredump code showed that there was an existing violation. The dev_coredumpm_timeout() call is used to register the dump with the base coredump subsystem. However, that makes multiple memory allocations, only some of which use the GFP_ flags passed in. So that also needs to be deferred to the worker function where it is safe to allocate with arbitrary flags. In order to not add protoypes for the callback functions, moving the _timeout call also means moving the worker thread function to later in the file. v2: Rebased after other changes to the worker function. Fixes: e799485044cb ("drm/xe: Introduce the dev_coredump infrastructure.") Cc: Thomas Hellström <thomas.hellstrom(a)linux.intel.com> Cc: Matthew Brost <matthew.brost(a)intel.com> Cc: Jani Nikula <jani.nikula(a)linux.intel.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Francois Dugast <francois.dugast(a)intel.com> Cc: Rodrigo Vivi <rodrigo.vivi(a)intel.com> Cc: Lucas De Marchi <lucas.demarchi(a)intel.com> Cc: "Thomas Hellström" <thomas.hellstrom(a)linux.intel.com> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: "Christian König" <christian.koenig(a)amd.com> Cc: intel-xe(a)lists.freedesktop.org Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.8+ Signed-off-by: John Harrison <John.C.Harrison(a)Intel.com> Reviewed-by: Matthew Brost <matthew.brost(a)intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241128210824.3302147-3-John… (cherry picked from commit 90f51a7f4ec1004fc4ddfbc6d1f1068d85ef4771) Signed-off-by: Thomas Hellström <thomas.hellstrom(a)linux.intel.com> Signed-off-by: Sasha Levin <sashal(a)kernel.org> --- drivers/gpu/drm/xe/xe_devcoredump.c | 73 
+++++++++++++++-------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index c40c91e27f71..c18e463092af 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -144,36 +144,6 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss) ss->vm = NULL; } -static void xe_devcoredump_deferred_snap_work(struct work_struct *work) -{ - struct xe_devcoredump_snapshot *ss = container_of(work, typeof(*ss), work); - struct xe_devcoredump *coredump = container_of(ss, typeof(*coredump), snapshot); - struct xe_device *xe = coredump_to_xe(coredump); - unsigned int fw_ref; - - xe_pm_runtime_get(xe); - - /* keep going if fw fails as we still want to save the memory and SW data */ - fw_ref = xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL); - if (!xe_force_wake_ref_has_domain(fw_ref, XE_FORCEWAKE_ALL)) - xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n"); - xe_vm_snapshot_capture_delayed(ss->vm); - xe_guc_exec_queue_snapshot_capture_delayed(ss->ge); - xe_force_wake_put(gt_to_fw(ss->gt), fw_ref); - - xe_pm_runtime_put(xe); - - /* Calculate devcoredump size */ - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump); - - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); - if (!ss->read.buffer) - return; - - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump); - xe_devcoredump_snapshot_free(ss); -} - static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, size_t count, void *data, size_t datalen) { @@ -222,6 +192,45 @@ static void xe_devcoredump_free(void *data) "Xe device coredump has been deleted.\n"); } +static void xe_devcoredump_deferred_snap_work(struct work_struct *work) +{ + struct xe_devcoredump_snapshot *ss = container_of(work, typeof(*ss), work); + struct xe_devcoredump *coredump = container_of(ss, typeof(*coredump), snapshot); + struct xe_device *xe = 
coredump_to_xe(coredump); + unsigned int fw_ref; + + /* + * NB: Despite passing a GFP_ flags parameter here, more allocations are done + * internally using GFP_KERNEL expliictly. Hence this call must be in the worker + * thread and not in the initial capture call. + */ + dev_coredumpm_timeout(gt_to_xe(ss->gt)->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL, + xe_devcoredump_read, xe_devcoredump_free, + XE_COREDUMP_TIMEOUT_JIFFIES); + + xe_pm_runtime_get(xe); + + /* keep going if fw fails as we still want to save the memory and SW data */ + fw_ref = xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL); + if (!xe_force_wake_ref_has_domain(fw_ref, XE_FORCEWAKE_ALL)) + xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n"); + xe_vm_snapshot_capture_delayed(ss->vm); + xe_guc_exec_queue_snapshot_capture_delayed(ss->ge); + xe_force_wake_put(gt_to_fw(ss->gt), fw_ref); + + xe_pm_runtime_put(xe); + + /* Calculate devcoredump size */ + ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump); + + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); + if (!ss->read.buffer) + return; + + __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump); + xe_devcoredump_snapshot_free(ss); +} + static void devcoredump_snapshot(struct xe_devcoredump *coredump, struct xe_sched_job *job) { @@ -305,10 +314,6 @@ void xe_devcoredump(struct xe_sched_job *job) drm_info(&xe->drm, "Xe device coredump has been created\n"); drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n", xe->drm.primary->index); - - dev_coredumpm_timeout(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL, - xe_devcoredump_read, xe_devcoredump_free, - XE_COREDUMP_TIMEOUT_JIFFIES); } static void xe_driver_devcoredump_fini(void *arg) -- 2.39.5

5 months, 1 week

1
0
0 0

Re: [PATCH v4 0/6] TEE subsystem for restricted dma-buf allocations

by Sumit Garg

Hi Lukas, On Tue, 24 Dec 2024 at 14:58, Lukas Wunner <lukas(a)wunner.de> wrote: > > On Tue, Dec 24, 2024 at 12:05:19PM +0530, Sumit Garg wrote: > > Restricted memory is a feature enforced by hardware specific firewalls > > where a particular TEE implementation governs which particular block > > of memory is accessible to a particular peripheral or a CPU running in > > a higher privileged mode than the Linux kernel. > [...] > > - Another possible use-case can be for the TEE implementation to store > > key material in a restricted buffer which is only accessible to the > > hardware crypto accelerator. > > Just a heads-up: > > For RSA sign/verify operations using rsassa-pkcs1 encoding, > the message to be signed/verified (which I understand could > be located in restricted memory) is prepended by a padding. > > The crypto subsystem does the prepending of the padding in software. > The actual signature generation/verification (which is an RSA encrypt > or decrypt operation) may be performed in hardware by a crypto > accelerator. > > Before commit 8552cb04e083 ("crypto: rsassa-pkcs1 - Copy source > data for SG list"), the kernel constructed a scatterlist > consisting of the padding on the one hand, and of the message > to be signed/verified on the other hand. I believe this worked > for use cases where the message is located in restricted memory. > > However since that commit, the kernel kmalloc's a new buffer and > copies the message to be signed/verified into it. The argument > was that although the *kernel* may be able to access the data, > the crypto accelerator may *not* be able to do so. In particular, > portions of the padding are located in the kernel's .rodata section > which is a valid virtual address on x86 but not on arm64 and > which may be inaccessible to a crypto accelerator. 
> > However in the case of restricted memory, the situation is exactly > the opposite: The kernel may *not* be able to access the data, > but the crypto accelerator can access it just fine. > > I did raise a concern about this to the maintainer, but to no avail: > https://lore.kernel.org/r/Z1Kym1-9ka8kGHrM@wunner.de/ Herbert's point is valid that there isn't any point for mapping restricted memory in the kernel virtual address space as any kernel access to that space can lead to platform specific hardware error scenarios. And for that reason we simply disallow dma_buf_mmap() and don't support dma_buf_vmap() for DMA-bufs holding TEE restricted memory. The only consumers for those DMA-bufs will be the DMA capable peripherals granted access permissions by the TEE implementation. IOW, kernel role here will be to just provide the DMA-buf infrastructure for buffers to be set up by TEE and then setting up DMA addresses for peripherals to access them. The hardware crypto accelerator can be one such peripheral. > > This is the alternative solution I would have preferred: > https://lore.kernel.org/r/3de5d373c86dcaa5abc36f501c1398c4fbf05f2f.17328651… > > > I am also in favour of end to end open source use-cases. But I fear > > without progressing in a step wise manner as with this proposal we > > would rather force developers to upstream all the software pieces in > > one go which will be kind of a chicken and egg situation. I am sure > > once this feature lands Mediatek folks will be interested to port > > their secure video playback patchset [3] on top of it. Similarly other > > silicon vendors like NXP, Qcom etc. will be motivated to do the same. > > The crypto use case may be easier to bring up than the video decoding > use case because you don't need to implement a huge amount of > user space code. Agree, if you already have such an existing hardware use-case then please feel free to build up on this patch-set. -Sumit > > Thanks, > > Lukas

5 months, 1 week

1
0
0 0

Re: [PATCH v4 0/6] TEE subsystem for restricted dma-buf allocations

by Dmitry Baryshkov

On Tue, Dec 24, 2024 at 10:32:41AM +0100, Lukas Wunner wrote: > On Tue, Dec 24, 2024 at 10:28:31AM +0100, Lukas Wunner wrote: > > I did raise a concern about this to the maintainer, but to no avail: > > https://lore.kernel.org/r/Z1Kym1-9ka8kGHrM@wunner.de/ > > Sorry, wrong link. This is the one I meant to copy-paste... :( > > https://lore.kernel.org/r/Z0rPxCGdD7r8HFKb@wunner.de/ Herbert asked a logical question, which got no response from your side. -- With best wishes Dmitry

5 months, 2 weeks

1
0
0 0

Re: [PATCH v4 0/6] TEE subsystem for restricted dma-buf allocations

by Sumit Garg

Hi Simona, On Wed, 18 Dec 2024 at 16:36, Simona Vetter <simona.vetter(a)ffwll.ch> wrote: > > On Tue, Dec 17, 2024 at 11:07:36AM +0100, Jens Wiklander wrote: > > Hi, > > > > This patch set allocates the restricted DMA-bufs via the TEE subsystem. > > > > The TEE subsystem handles the DMA-buf allocations since it is the TEE > > (OP-TEE, AMD-TEE, TS-TEE, or perhaps a future QCOMTEE) which sets up the > > restrictions for the memory used for the DMA-bufs. > > > > I've added a new IOCTL, TEE_IOC_RSTMEM_ALLOC, to allocate the restricted > > DMA-bufs. This IOCTL reaches the backend TEE driver, allowing it to choose > > how to allocate the restricted physical memory. > > > > TEE_IOC_RSTMEM_ALLOC takes in addition to a size and flags parameters also > > a use-case parameter. This is used by the backend TEE driver to decide on > > allocation policy and which devices should be able to access the memory. > > > > Three use-cases (Secure Video Playback, Trusted UI, and Secure Video > > Recording) has been identified so far to serve as examples of what can be > > expected. More use-cases can be added in userspace ABI, but it's up to the > > backend TEE drivers to provide the implementation. > > > > Each use-case has it's own restricted memory pool since different use-cases > > requires isolation from different parts of the system. A restricted memory > > pool can be based on a static carveout instantiated while probing the TEE > > backend driver, or dynamically allocated from CMA and made restricted as > > needed by the TEE. > > > > This can be tested on QEMU with the following steps: > > repo init -u https://github.com/jenswi-linaro/manifest.git -m qemu_v8.xml \ > > -b prototype/sdp-v4 > > repo sync -j8 > > cd build > > make toolchains -j$(nproc) > > make SPMC_AT_EL=1 all -j$(nproc) > > make SPMC_AT_EL=1 run-only > > # login and at the prompt: > > xtest --sdp-basic > > > > The SPMC_AT_EL=1 parameter configures the build with FF-A and an SPMC at > > S-EL1 inside OP-TEE. 
The parameter can be changed into SPMC_AT_EL=n to test > > without FF-A using the original SMC ABI instead. Please remember to do > > %rm -rf ../trusted-firmware-a/build/qemu > > for TF-A to be rebuilt properly using the new configuration. > > > > https://optee.readthedocs.io/en/latest/building/prerequisites.html > > list dependencies needed to build the above. > > > > The tests are pretty basic, mostly checking that a Trusted Application in > > the secure world can access and manipulate the memory. There are also some > > negative tests for out of bounds buffers etc. > > I think I've dropped this on earlier encrypted dma-buf discussions for > TEE, but can't find one right now ... Thanks for raising this query. > > Do we have some open source userspace for this? To my knowledge we have > two implementations of encrypted/content protected dma-buf in upstream > right now in the amd and intel gpu drivers, and unless I'm mistaken they > both have some minimal userspace supporting EXT_protected_textures: First of all, to clarify: the support Jens is adding here for allocating restricted shared memory in the TEE subsystem is meant to be generic and not specific to only the secure media pipeline use-case. Then here we not only have open source test applications but rather open source firmware too (OP-TEE as a Trusted OS) [1] supporting this as a core feature where we maintain a stable and extensible ABI among the kernel and the OP-TEE core. Restricted memory is a feature enforced by hardware specific firewalls where a particular TEE implementation governs which particular block of memory is accessible to a particular peripheral or a CPU running in a higher privileged mode than the Linux kernel. There can be numerous use-cases surrounding that as follows: - Secure media pipeline where the contents get decrypted and stored in a restricted buffer which is then accessible only to media display pipeline peripherals. 
- Trusted user interface where a peripheral takes input from the user and stores it in a restricted buffer which then is accessible to TEE implementation only. - Another possible use-case can be for the TEE implementation to store key material in a restricted buffer which is only accessible to the hardware crypto accelerator. I am sure there will be more use-cases related to this feature but those will only be possible once we provide a stable and extensible restricted memory interface among the Linux user-space and the secure world user-space (normally referred to as Trusted Applications). [1] https://github.com/OP-TEE/optee_os/pull/7159 > > https://github.com/KhronosGroup/OpenGL-Registry/blob/main/extensions/EXT/EX… > > It's not great, but it does just barely clear the bar in my opinion. I > guess something in gstreamer or similar video pipeline framework would > also do the job. > > Especially with the context of the uapi discussion in the v1/RFC thread I > think we need more than a bare-bones testcase to make sure this works in > actual use. Currently the TEE subsystem already supports a stable ABI for shared memory allocator among Linux user-space and secure world user-space here [2]. And the stable ABI for restricted memory is also along the same lines meant to be a vendor neutral abstraction for the user-space access. The current test cases not only test the interface but also perform regression tests too. I am also in favour of end to end open source use-cases. But I fear without progressing in a step wise manner as with this proposal we would rather force developers to upstream all the software pieces in one go which will be kind of a chicken and egg situation. I am sure once this feature lands Mediatek folks will be interested to port their secure video playback patchset [3] on top of it. Similarly other silicon vendors like NXP, Qcom etc. will be motivated to do the same. 
[2] https://docs.kernel.org/userspace-api/tee.html [3] https://lore.kernel.org/linux-arm-kernel/20240515112308.10171-1-yong.wu@med… -Sumit > > Cheers, Sima > > > > > Thanks, > > Jens > > > > Changes since V3: > > * Make the use_case and flags field in struct tee_shm u32's instead of > > u16's > > * Add more description for TEE_IOC_RSTMEM_ALLOC in the header file > > * Import namespace DMA_BUF in module tee, reported by lkp(a)intel.com > > * Added a note in the commit message for "optee: account for direction > > while converting parameters" why it's needed > > * Factor out dynamic restricted memory allocation from > > "optee: support restricted memory allocation" into two new commits > > "optee: FF-A: dynamic restricted memory allocation" and > > "optee: smc abi: dynamic restricted memory allocation" > > * Guard CMA usage with #ifdef CONFIG_CMA, effectively disabling dynamic > > restricted memory allocate if CMA isn't configured > > > > Changes since the V2 RFC: > > * Based on v6.12 > > * Replaced the flags for SVP and Trusted UID memory with a u32 field with > > unique id for each use case > > * Added dynamic allocation of restricted memory pools > > * Added OP-TEE ABI both with and without FF-A for dynamic restricted memory > > * Added support for FF-A with FFA_LEND > > > > Changes since the V1 RFC: > > * Based on v6.11 > > * Complete rewrite, replacing the restricted heap with TEE_IOC_RSTMEM_ALLOC > > > > Changes since Olivier's post [2]: > > * Based on Yong Wu's post [1] where much of dma-buf handling is done in > > the generic restricted heap > > * Simplifications and cleanup > > * New commit message for "dma-buf: heaps: add Linaro restricted dmabuf heap > > support" > > * Replaced the word "secure" with "restricted" where applicable > > > > Jens Wiklander (6): > > tee: add restricted memory allocation > > optee: account for direction while converting parameters > > optee: sync secure world ABI headers > > optee: support restricted memory allocation > > optee: 
FF-A: dynamic restricted memory allocation > > optee: smc abi: dynamic restricted memory allocation > > > > drivers/tee/Makefile | 1 + > > drivers/tee/optee/Makefile | 1 + > > drivers/tee/optee/call.c | 10 +- > > drivers/tee/optee/core.c | 1 + > > drivers/tee/optee/ffa_abi.c | 178 +++++++++++++- > > drivers/tee/optee/optee_ffa.h | 27 ++- > > drivers/tee/optee/optee_msg.h | 65 ++++- > > drivers/tee/optee/optee_private.h | 75 ++++-- > > drivers/tee/optee/optee_smc.h | 71 +++++- > > drivers/tee/optee/rpc.c | 31 ++- > > drivers/tee/optee/rstmem.c | 388 ++++++++++++++++++++++++++++++ > > drivers/tee/optee/smc_abi.c | 213 ++++++++++++++-- > > drivers/tee/tee_core.c | 38 ++- > > drivers/tee/tee_private.h | 2 + > > drivers/tee/tee_rstmem.c | 201 ++++++++++++++++ > > drivers/tee/tee_shm.c | 2 + > > drivers/tee/tee_shm_pool.c | 69 +++++- > > include/linux/tee_core.h | 15 ++ > > include/linux/tee_drv.h | 2 + > > include/uapi/linux/tee.h | 44 +++- > > 20 files changed, 1358 insertions(+), 76 deletions(-) > > create mode 100644 drivers/tee/optee/rstmem.c > > create mode 100644 drivers/tee/tee_rstmem.c > > > > > > base-commit: fac04efc5c793dccbd07e2d59af9f90b7fc0dca4 > > -- > > 2.43.0 > > > > -- > Simona Vetter > Software Engineer, Intel Corporation > http://blog.ffwll.ch

5 months, 2 weeks

1
0
0 0

[PATCH v4 0/6] TEE subsystem for restricted dma-buf allocations

by Jens Wiklander

Hi, This patch set allocates the restricted DMA-bufs via the TEE subsystem. The TEE subsystem handles the DMA-buf allocations since it is the TEE (OP-TEE, AMD-TEE, TS-TEE, or perhaps a future QCOMTEE) which sets up the restrictions for the memory used for the DMA-bufs. I've added a new IOCTL, TEE_IOC_RSTMEM_ALLOC, to allocate the restricted DMA-bufs. This IOCTL reaches the backend TEE driver, allowing it to choose how to allocate the restricted physical memory. TEE_IOC_RSTMEM_ALLOC takes, in addition to size and flags parameters, also a use-case parameter. This is used by the backend TEE driver to decide on allocation policy and which devices should be able to access the memory. Three use-cases (Secure Video Playback, Trusted UI, and Secure Video Recording) have been identified so far to serve as examples of what can be expected. More use-cases can be added in userspace ABI, but it's up to the backend TEE drivers to provide the implementation. Each use-case has its own restricted memory pool since different use-cases require isolation from different parts of the system. A restricted memory pool can be based on a static carveout instantiated while probing the TEE backend driver, or dynamically allocated from CMA and made restricted as needed by the TEE. This can be tested on QEMU with the following steps: repo init -u https://github.com/jenswi-linaro/manifest.git -m qemu_v8.xml \ -b prototype/sdp-v4 repo sync -j8 cd build make toolchains -j$(nproc) make SPMC_AT_EL=1 all -j$(nproc) make SPMC_AT_EL=1 run-only # login and at the prompt: xtest --sdp-basic The SPMC_AT_EL=1 parameter configures the build with FF-A and an SPMC at S-EL1 inside OP-TEE. The parameter can be changed into SPMC_AT_EL=n to test without FF-A using the original SMC ABI instead. Please remember to do %rm -rf ../trusted-firmware-a/build/qemu for TF-A to be rebuilt properly using the new configuration. 
https://optee.readthedocs.io/en/latest/building/prerequisites.html list dependencies needed to build the above. The tests are pretty basic, mostly checking that a Trusted Application in the secure world can access and manipulate the memory. There are also some negative tests for out of bounds buffers etc. Thanks, Jens Changes since V3: * Make the use_case and flags field in struct tee_shm u32's instead of u16's * Add more description for TEE_IOC_RSTMEM_ALLOC in the header file * Import namespace DMA_BUF in module tee, reported by lkp(a)intel.com * Added a note in the commit message for "optee: account for direction while converting parameters" why it's needed * Factor out dynamic restricted memory allocation from "optee: support restricted memory allocation" into two new commits "optee: FF-A: dynamic restricted memory allocation" and "optee: smc abi: dynamic restricted memory allocation" * Guard CMA usage with #ifdef CONFIG_CMA, effectively disabling dynamic restricted memory allocate if CMA isn't configured Changes since the V2 RFC: * Based on v6.12 * Replaced the flags for SVP and Trusted UID memory with a u32 field with unique id for each use case * Added dynamic allocation of restricted memory pools * Added OP-TEE ABI both with and without FF-A for dynamic restricted memory * Added support for FF-A with FFA_LEND Changes since the V1 RFC: * Based on v6.11 * Complete rewrite, replacing the restricted heap with TEE_IOC_RSTMEM_ALLOC Changes since Olivier's post [2]: * Based on Yong Wu's post [1] where much of dma-buf handling is done in the generic restricted heap * Simplifications and cleanup * New commit message for "dma-buf: heaps: add Linaro restricted dmabuf heap support" * Replaced the word "secure" with "restricted" where applicable Jens Wiklander (6): tee: add restricted memory allocation optee: account for direction while converting parameters optee: sync secure world ABI headers optee: support restricted memory allocation optee: FF-A: dynamic restricted 
memory allocation optee: smc abi: dynamic restricted memory allocation drivers/tee/Makefile | 1 + drivers/tee/optee/Makefile | 1 + drivers/tee/optee/call.c | 10 +- drivers/tee/optee/core.c | 1 + drivers/tee/optee/ffa_abi.c | 178 +++++++++++++- drivers/tee/optee/optee_ffa.h | 27 ++- drivers/tee/optee/optee_msg.h | 65 ++++- drivers/tee/optee/optee_private.h | 75 ++++-- drivers/tee/optee/optee_smc.h | 71 +++++- drivers/tee/optee/rpc.c | 31 ++- drivers/tee/optee/rstmem.c | 388 ++++++++++++++++++++++++++++++ drivers/tee/optee/smc_abi.c | 213 ++++++++++++++-- drivers/tee/tee_core.c | 38 ++- drivers/tee/tee_private.h | 2 + drivers/tee/tee_rstmem.c | 201 ++++++++++++++++ drivers/tee/tee_shm.c | 2 + drivers/tee/tee_shm_pool.c | 69 +++++- include/linux/tee_core.h | 15 ++ include/linux/tee_drv.h | 2 + include/uapi/linux/tee.h | 44 +++- 20 files changed, 1358 insertions(+), 76 deletions(-) create mode 100644 drivers/tee/optee/rstmem.c create mode 100644 drivers/tee/tee_rstmem.c base-commit: fac04efc5c793dccbd07e2d59af9f90b7fc0dca4 -- 2.43.0

5 months, 2 weeks

2
8
0 0

Re: [PATCH] drm/sched: Document run_job() refcount hazard

by Christian König

Am 20.12.24 um 15:51 schrieb Danilo Krummrich: > On Fri, Dec 20, 2024 at 03:11:34PM +0100, Philipp Stanner wrote: >> On Fri, 2024-12-20 at 14:25 +0100, Christian König wrote: >>> Am 20.12.24 um 14:18 schrieb Danilo Krummrich: >>>> On Fri, Dec 20, 2024 at 01:53:34PM +0100, Christian König wrote: >>>>> Am 20.12.24 um 13:45 schrieb Philipp Stanner: >>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c >>>>>> b/drivers/gpu/drm/scheduler/sched_main.c >>>>>> index 7ce25281c74c..d6f8df39d848 100644 >>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c >>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c >>>>>> + * >>>>>> + * @sched_job: the job to run >>>>>> + * >>>>>> + * Returns: dma_fence the driver must signal once the >>>>>> hardware has >>>>>> + * completed the job ("hardware fence"). >>>>>> + * >>>>>> + * Note that the scheduler expects to 'inherit' its >>>>>> own reference to >>>>>> + * this fence from the callback. It does not invoke an >>>>>> extra >>>>>> + * dma_fence_get() on it. Consequently, this callback >>>>>> must return a >>>>>> + * fence whose refcount is at least 2: One for the >>>>>> scheduler's >>>>>> + * reference returned here, another one for the >>>>>> reference kept by the >>>>>> + * driver. >>>>> Well the driver actually doesn't need any extra reference. The >>>>> scheduler >>>>> just needs to guarantee that this reference isn't dropped before >>>>> it is >>>>> signaled. >>>> I think he means the reference the driver's fence context has to >>>> have in order >>>> to signal that thing eventually. >>> Yeah, but this is usually a weak reference. IIRC most drivers don't >>> increment the reference count for the reference they keep to signal a >>> fence. >>> >>> It's expected that the consumers of the dma_fence keep the fence >>> alive >>> at least until it is signaled. 
>> So are you saying that the driver having an extra reference (without >> having obtained it with dma_fence_get()) is not an issue because the >> driver is the one who will signal the fence [and then be done with it]? > It's never a "real" issue if you have multiple pointers to a reference counted > object as long as you can ensure that you hold at least one reference for the > time you have pointers to the object. Well, I'm not saying that this isn't an issue. I'm just pointing out that this is the current practice :) > But, that's bad design. For every pointer to an object a separate reference > should be taken. Yeah, completely agree. Weak references are usually a bad idea if you don't absolutely need them for something. Regards, Christian.

5 months, 2 weeks

1
0
0 0

Re: [PATCH] drm/sched: Document run_job() refcount hazard

by Christian König

Am 20.12.24 um 14:18 schrieb Danilo Krummrich: > On Fri, Dec 20, 2024 at 01:53:34PM +0100, Christian König wrote: >> Am 20.12.24 um 13:45 schrieb Philipp Stanner: >>> From: Philipp Stanner <pstanner(a)redhat.com> >>> >>> drm_sched_backend_ops.run_job() returns a dma_fence for the scheduler. >>> That fence is signalled by the driver once the hardware completed the >>> associated job. The scheduler does not increment the reference count on >>> that fence, but implicitly expects to inherit this fence from run_job(). >>> >>> This is relatively subtle and prone to misunderstandings. >>> >>> This implies that, to keep a reference for itself, a driver needs to >>> call dma_fence_get() in addition to dma_fence_init() in that callback. >>> >>> It's further complicated by the fact that the scheduler even decrements >>> the refcount in drm_sched_run_job_work() since it created a new >>> reference in drm_sched_fence_scheduled(). It does, however, still use >>> its pointer to the fence after calling dma_fence_put() - which is safe >>> because of the aforementioned new reference, but actually still violates >>> the refcounting rules. >>> >>> Improve the explanatory comment for that decrement. >>> >>> Move the call to dma_fence_put() to the position behind the last usage >>> of the fence. >>> >>> Document the necessity to increment the reference count in >>> drm_sched_backend_ops.run_job(). 
>>> >>> Cc: Christian König <christian.koenig(a)amd.com> >>> Cc: Tvrtko Ursulin <tursulin(a)ursulin.net> >>> Cc: Andrey Grodzovsky <andrey.grodzovsky(a)amd.com> >>> Signed-off-by: Philipp Stanner <pstanner(a)redhat.com> >>> --- >>> drivers/gpu/drm/scheduler/sched_main.c | 10 +++++++--- >>> include/drm/gpu_scheduler.h | 20 ++++++++++++++++---- >>> 2 files changed, 23 insertions(+), 7 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c >>> index 7ce25281c74c..d6f8df39d848 100644 >>> --- a/drivers/gpu/drm/scheduler/sched_main.c >>> +++ b/drivers/gpu/drm/scheduler/sched_main.c >>> + * >>> + * @sched_job: the job to run >>> + * >>> + * Returns: dma_fence the driver must signal once the hardware has >>> + * completed the job ("hardware fence"). >>> + * >>> + * Note that the scheduler expects to 'inherit' its own reference to >>> + * this fence from the callback. It does not invoke an extra >>> + * dma_fence_get() on it. Consequently, this callback must return a >>> + * fence whose refcount is at least 2: One for the scheduler's >>> + * reference returned here, another one for the reference kept by the >>> + * driver. >> Well the driver actually doesn't need any extra reference. The scheduler >> just needs to guarantee that this reference isn't dropped before it is >> signaled. > I think he means the reference the driver's fence context has to have in order > to signal that thing eventually. Yeah, but this is usually a weak reference. IIRC most drivers don't increment the reference count for the reference they keep to signal a fence. It's expected that the consumers of the dma_fence keep the fence alive at least until it is signaled. That's why we have this nice warning in dma_fence_release(). On the other hand I completely agree it would be more defensive if drivers increment the reference count for the reference they keep for signaling. 
So if we want to document that the fence reference count should be at least 2, we somehow need to enforce this, with a warning for example. Regards, Christian. > >> Regards, >> Christian. >> >>> */ >>> struct dma_fence *(*run_job)(struct drm_sched_job *sched_job);

5 months, 2 weeks

1
0
0 0

Re: [PATCH] drm/sched: Document run_job() refcount hazard

by Christian König

Am 20.12.24 um 13:45 schrieb Philipp Stanner: > From: Philipp Stanner <pstanner(a)redhat.com> > > drm_sched_backend_ops.run_job() returns a dma_fence for the scheduler. > That fence is signalled by the driver once the hardware completed the > associated job. The scheduler does not increment the reference count on > that fence, but implicitly expects to inherit this fence from run_job(). > > This is relatively subtle and prone to misunderstandings. > > This implies that, to keep a reference for itself, a driver needs to > call dma_fence_get() in addition to dma_fence_init() in that callback. > > It's further complicated by the fact that the scheduler even decrements > the refcount in drm_sched_run_job_work() since it created a new > reference in drm_sched_fence_scheduled(). It does, however, still use > its pointer to the fence after calling dma_fence_put() - which is safe > because of the aforementioned new reference, but actually still violates > the refcounting rules. > > Improve the explanatory comment for that decrement. > > Move the call to dma_fence_put() to the position behind the last usage > of the fence. > > Document the necessity to increment the reference count in > drm_sched_backend_ops.run_job(). 
> > Cc: Christian König <christian.koenig(a)amd.com> > Cc: Tvrtko Ursulin <tursulin(a)ursulin.net> > Cc: Andrey Grodzovsky <andrey.grodzovsky(a)amd.com> > Signed-off-by: Philipp Stanner <pstanner(a)redhat.com> > --- > drivers/gpu/drm/scheduler/sched_main.c | 10 +++++++--- > include/drm/gpu_scheduler.h | 20 ++++++++++++++++---- > 2 files changed, 23 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c > index 7ce25281c74c..d6f8df39d848 100644 > --- a/drivers/gpu/drm/scheduler/sched_main.c > +++ b/drivers/gpu/drm/scheduler/sched_main.c > @@ -1218,15 +1218,19 @@ static void drm_sched_run_job_work(struct work_struct *w) > drm_sched_fence_scheduled(s_fence, fence); > > if (!IS_ERR_OR_NULL(fence)) { > - /* Drop for original kref_init of the fence */ > - dma_fence_put(fence); > - > r = dma_fence_add_callback(fence, &sched_job->cb, > drm_sched_job_done_cb); > if (r == -ENOENT) > drm_sched_job_done(sched_job, fence->error); > else if (r) > DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); > + > + /* > + * s_fence took a new reference to fence in the call to > + * drm_sched_fence_scheduled() above. The reference passed by > + * run_job() above is now not needed any longer. Drop it. > + */ > + dma_fence_put(fence); > } else { > drm_sched_job_done(sched_job, IS_ERR(fence) ? > PTR_ERR(fence) : 0); > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h > index 95e17504e46a..a1f5c9a14278 100644 > --- a/include/drm/gpu_scheduler.h > +++ b/include/drm/gpu_scheduler.h > @@ -420,10 +420,22 @@ struct drm_sched_backend_ops { > struct drm_sched_entity *s_entity); > > /** > - * @run_job: Called to execute the job once all of the dependencies > - * have been resolved. This may be called multiple times, if > - * timedout_job() has happened and drm_sched_job_recovery() > - * decides to try it again. 
> + * @run_job: Called to execute the job once all of the dependencies > + * have been resolved. This may be called multiple times, if > + * timedout_job() has happened and drm_sched_job_recovery() decides to > + * try it again. Maybe we should improve that here as well while we're at it. That drm_sched_job_recovery() can call this multiple times actually goes against the dma_fence rules since drivers can't easily allocate a new HW fence. Something like "The deprecated drm_sched_job_recovery() function might call this again, but it is strongly advised to not be used because it violates dma_fence memory allocation rules." On the other hand, that can of course be a separate patch. > + * > + * @sched_job: the job to run > + * > + * Returns: dma_fence the driver must signal once the hardware has > + * completed the job ("hardware fence"). > + * > + * Note that the scheduler expects to 'inherit' its own reference to > + * this fence from the callback. It does not invoke an extra > + * dma_fence_get() on it. Consequently, this callback must return a > + * fence whose refcount is at least 2: One for the scheduler's > + * reference returned here, another one for the reference kept by the > + * driver. Well the driver actually doesn't need any extra reference. The scheduler just needs to guarantee that this reference isn't dropped before it is signaled. Regards, Christian. > */ > struct dma_fence *(*run_job)(struct drm_sched_job *sched_job); >

5 months, 2 weeks

1
0
0 0

[PATCH 6.1 633/772] dma-fence: Use kernel's sort for merging fences

by Greg Kroah-Hartman

6.1-stable review patch. If anyone has any objections, please let me know. ------------------ From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit fe52c649438b8489c9456681d93a9b3de3d38263 upstream. One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@am… is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.)
A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 128 ++++++++++++++--------------- 1 file changed, 61 insertions(+), 67 deletions(-) diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c index b19d0adf6086..6345062731f1 100644 --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -12,6 +12,7 @@ #include <linux/dma-fence-chain.h> #include <linux/dma-fence-unwrap.h> #include <linux/slab.h> +#include <linux/sort.h> /* Internal helper to start new array iteration, don't use directly */ static struct dma_fence * @@ -59,6 +60,25 @@ struct dma_fence *dma_fence_unwrap_next(struct dma_fence_unwrap *cursor) } 
EXPORT_SYMBOL_GPL(dma_fence_unwrap_next); + +static int fence_cmp(const void *_a, const void *_b) +{ + struct dma_fence *a = *(struct dma_fence **)_a; + struct dma_fence *b = *(struct dma_fence **)_b; + + if (a->context < b->context) + return -1; + else if (a->context > b->context) + return 1; + + if (dma_fence_is_later(b, a)) + return 1; + else if (dma_fence_is_later(a, b)) + return -1; + + return 0; +} + /* Implementation for the dma_fence_merge() marco, don't use directly */ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence **fences, @@ -67,8 +87,7 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence_array *result; struct dma_fence *tmp, **array; ktime_t timestamp; - unsigned int i; - size_t count; + int i, j, count; count = 0; timestamp = ns_to_ktime(0); @@ -96,80 +115,55 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, if (!array) return NULL; - /* - * This trashes the input fence array and uses it as position for the - * following merge loop. This works because the dma_fence_merge() - * wrapper macro is creating this temporary array on the stack together - * with the iterators. - */ - for (i = 0; i < num_fences; ++i) - fences[i] = dma_fence_unwrap_first(fences[i], &iter[i]); - count = 0; - do { - unsigned int sel; - -restart: - tmp = NULL; - for (i = 0; i < num_fences; ++i) { - struct dma_fence *next; - - while (fences[i] && dma_fence_is_signaled(fences[i])) - fences[i] = dma_fence_unwrap_next(&iter[i]); - - next = fences[i]; - if (!next) - continue; - - /* - * We can't guarantee that inpute fences are ordered by - * context, but it is still quite likely when this - * function is used multiple times. So attempt to order - * the fences by context as we pass over them and merge - * fences with the same context. 
- */ - if (!tmp || tmp->context > next->context) { - tmp = next; - sel = i; - - } else if (tmp->context < next->context) { - continue; - - } else if (dma_fence_is_later(tmp, next)) { - fences[i] = dma_fence_unwrap_next(&iter[i]); - goto restart; + for (i = 0; i < num_fences; ++i) { + dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) { + if (!dma_fence_is_signaled(tmp)) { + array[count++] = dma_fence_get(tmp); } else { - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - goto restart; + ktime_t t = dma_fence_timestamp(tmp); + + if (ktime_after(t, timestamp)) + timestamp = t; } } - - if (tmp) { - array[count++] = dma_fence_get(tmp); - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - } - } while (tmp); - - if (count == 0) { - tmp = dma_fence_allocate_private_stub(ktime_get()); - goto return_tmp; } - if (count == 1) { - tmp = array[0]; - goto return_tmp; - } + if (count == 0 || count == 1) + goto return_fastpath; - result = dma_fence_array_create(count, array, - dma_fence_context_alloc(1), - 1, false); - if (!result) { - for (i = 0; i < count; i++) + sort(array, count, sizeof(*array), fence_cmp, NULL); + + /* + * Only keep the most recent fence for each context. + */ + j = 0; + for (i = 1; i < count; i++) { + if (array[i]->context == array[j]->context) dma_fence_put(array[i]); - tmp = NULL; - goto return_tmp; + else + array[++j] = array[i]; } - return &result->base; + count = ++j; + + if (count > 1) { + result = dma_fence_array_create(count, array, + dma_fence_context_alloc(1), + 1, false); + if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); + tmp = NULL; + goto return_tmp; + } + return &result->base; + } + +return_fastpath: + if (count == 0) + tmp = dma_fence_allocate_private_stub(timestamp); + else + tmp = array[0]; return_tmp: kfree(array); -- 2.47.1

5 months, 3 weeks

1
0
0 0

[PATCH 6.1 632/772] dma-fence: Fix reference leak on fence merge failure path

by Greg Kroah-Hartman

6.1-stable review patch. If anyone has any objections, please let me know. ------------------ From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit 949291c5314009b4f6e252391edbb40fdd5d5414 upstream. Release all fence references if the output dma-fence-array could not be allocated. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-2-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c index 628af51c81af..b19d0adf6086 100644 --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -164,6 +164,8 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, dma_fence_context_alloc(1), 1, false); if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); tmp = NULL; goto return_tmp; } -- 2.47.1

5 months, 3 weeks

1
0
0 0

[PATCH 6.6 175/356] dma-fence: Use kernel's sort for merging fences

by Greg Kroah-Hartman

6.6-stable review patch. If anyone has any objections, please let me know. ------------------ From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit fe52c649438b8489c9456681d93a9b3de3d38263 upstream. One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@am… is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.)
A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 126 +++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 66 deletions(-) --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -12,6 +12,7 @@ #include <linux/dma-fence-chain.h> #include <linux/dma-fence-unwrap.h> #include <linux/slab.h> +#include <linux/sort.h> /* Internal helper to start new array iteration, don't use directly */ static struct dma_fence * @@ -59,6 +60,25 @@ struct dma_fence *dma_fence_unwrap_next( } EXPORT_SYMBOL_GPL(dma_fence_unwrap_next); + +static int fence_cmp(const void *_a, const void *_b) +{ + struct dma_fence *a = *(struct dma_fence **)_a; + struct 
dma_fence *b = *(struct dma_fence **)_b; + + if (a->context < b->context) + return -1; + else if (a->context > b->context) + return 1; + + if (dma_fence_is_later(b, a)) + return 1; + else if (dma_fence_is_later(a, b)) + return -1; + + return 0; +} + /* Implementation for the dma_fence_merge() marco, don't use directly */ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence **fences, @@ -67,8 +87,7 @@ struct dma_fence *__dma_fence_unwrap_mer struct dma_fence_array *result; struct dma_fence *tmp, **array; ktime_t timestamp; - unsigned int i; - size_t count; + int i, j, count; count = 0; timestamp = ns_to_ktime(0); @@ -96,80 +115,55 @@ struct dma_fence *__dma_fence_unwrap_mer if (!array) return NULL; - /* - * This trashes the input fence array and uses it as position for the - * following merge loop. This works because the dma_fence_merge() - * wrapper macro is creating this temporary array on the stack together - * with the iterators. - */ - for (i = 0; i < num_fences; ++i) - fences[i] = dma_fence_unwrap_first(fences[i], &iter[i]); - count = 0; - do { - unsigned int sel; - -restart: - tmp = NULL; - for (i = 0; i < num_fences; ++i) { - struct dma_fence *next; - - while (fences[i] && dma_fence_is_signaled(fences[i])) - fences[i] = dma_fence_unwrap_next(&iter[i]); - - next = fences[i]; - if (!next) - continue; - - /* - * We can't guarantee that inpute fences are ordered by - * context, but it is still quite likely when this - * function is used multiple times. So attempt to order - * the fences by context as we pass over them and merge - * fences with the same context. 
- */ - if (!tmp || tmp->context > next->context) { - tmp = next; - sel = i; - - } else if (tmp->context < next->context) { - continue; - - } else if (dma_fence_is_later(tmp, next)) { - fences[i] = dma_fence_unwrap_next(&iter[i]); - goto restart; + for (i = 0; i < num_fences; ++i) { + dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) { + if (!dma_fence_is_signaled(tmp)) { + array[count++] = dma_fence_get(tmp); } else { - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - goto restart; + ktime_t t = dma_fence_timestamp(tmp); + + if (ktime_after(t, timestamp)) + timestamp = t; } } + } - if (tmp) { - array[count++] = dma_fence_get(tmp); - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - } - } while (tmp); + if (count == 0 || count == 1) + goto return_fastpath; - if (count == 0) { - tmp = dma_fence_allocate_private_stub(ktime_get()); - goto return_tmp; - } + sort(array, count, sizeof(*array), fence_cmp, NULL); - if (count == 1) { - tmp = array[0]; - goto return_tmp; + /* + * Only keep the most recent fence for each context. + */ + j = 0; + for (i = 1; i < count; i++) { + if (array[i]->context == array[j]->context) + dma_fence_put(array[i]); + else + array[++j] = array[i]; } + count = ++j; - result = dma_fence_array_create(count, array, - dma_fence_context_alloc(1), - 1, false); - if (!result) { - for (i = 0; i < count; i++) - dma_fence_put(array[i]); - tmp = NULL; - goto return_tmp; + if (count > 1) { + result = dma_fence_array_create(count, array, + dma_fence_context_alloc(1), + 1, false); + if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); + tmp = NULL; + goto return_tmp; + } + return &result->base; } - return &result->base; + +return_fastpath: + if (count == 0) + tmp = dma_fence_allocate_private_stub(timestamp); + else + tmp = array[0]; return_tmp: kfree(array);

5 months, 3 weeks

1
0
0 0

[PATCH 6.6 174/356] dma-fence: Fix reference leak on fence merge failure path

by Greg Kroah-Hartman

6.6-stable review patch. If anyone has any objections, please let me know. ------------------ From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit 949291c5314009b4f6e252391edbb40fdd5d5414 upstream. Release all fence references if the output dma-fence-array could not be allocated. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-2-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 2 ++ 1 file changed, 2 insertions(+) --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -164,6 +164,8 @@ restart: dma_fence_context_alloc(1), 1, false); if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); tmp = NULL; goto return_tmp; }

5 months, 3 weeks

1
0
0 0

[PATCH 6.12 187/466] dma-fence: Use kernel's sort for merging fences

by Greg Kroah-Hartman

6.12-stable review patch. If anyone has any objections, please let me know. ------------------ From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit fe52c649438b8489c9456681d93a9b3de3d38263 upstream. One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@am… is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.)
A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 126 +++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 66 deletions(-) --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -12,6 +12,7 @@ #include <linux/dma-fence-chain.h> #include <linux/dma-fence-unwrap.h> #include <linux/slab.h> +#include <linux/sort.h> /* Internal helper to start new array iteration, don't use directly */ static struct dma_fence * @@ -59,6 +60,25 @@ struct dma_fence *dma_fence_unwrap_next( } EXPORT_SYMBOL_GPL(dma_fence_unwrap_next); + +static int fence_cmp(const void *_a, const void *_b) +{ + struct dma_fence *a = *(struct dma_fence **)_a; + struct 
dma_fence *b = *(struct dma_fence **)_b; + + if (a->context < b->context) + return -1; + else if (a->context > b->context) + return 1; + + if (dma_fence_is_later(b, a)) + return 1; + else if (dma_fence_is_later(a, b)) + return -1; + + return 0; +} + /* Implementation for the dma_fence_merge() marco, don't use directly */ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence **fences, @@ -67,8 +87,7 @@ struct dma_fence *__dma_fence_unwrap_mer struct dma_fence_array *result; struct dma_fence *tmp, **array; ktime_t timestamp; - unsigned int i; - size_t count; + int i, j, count; count = 0; timestamp = ns_to_ktime(0); @@ -96,80 +115,55 @@ struct dma_fence *__dma_fence_unwrap_mer if (!array) return NULL; - /* - * This trashes the input fence array and uses it as position for the - * following merge loop. This works because the dma_fence_merge() - * wrapper macro is creating this temporary array on the stack together - * with the iterators. - */ - for (i = 0; i < num_fences; ++i) - fences[i] = dma_fence_unwrap_first(fences[i], &iter[i]); - count = 0; - do { - unsigned int sel; - -restart: - tmp = NULL; - for (i = 0; i < num_fences; ++i) { - struct dma_fence *next; - - while (fences[i] && dma_fence_is_signaled(fences[i])) - fences[i] = dma_fence_unwrap_next(&iter[i]); - - next = fences[i]; - if (!next) - continue; - - /* - * We can't guarantee that inpute fences are ordered by - * context, but it is still quite likely when this - * function is used multiple times. So attempt to order - * the fences by context as we pass over them and merge - * fences with the same context. 
- */ - if (!tmp || tmp->context > next->context) { - tmp = next; - sel = i; - - } else if (tmp->context < next->context) { - continue; - - } else if (dma_fence_is_later(tmp, next)) { - fences[i] = dma_fence_unwrap_next(&iter[i]); - goto restart; + for (i = 0; i < num_fences; ++i) { + dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) { + if (!dma_fence_is_signaled(tmp)) { + array[count++] = dma_fence_get(tmp); } else { - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - goto restart; + ktime_t t = dma_fence_timestamp(tmp); + + if (ktime_after(t, timestamp)) + timestamp = t; } } + } - if (tmp) { - array[count++] = dma_fence_get(tmp); - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - } - } while (tmp); + if (count == 0 || count == 1) + goto return_fastpath; - if (count == 0) { - tmp = dma_fence_allocate_private_stub(ktime_get()); - goto return_tmp; - } + sort(array, count, sizeof(*array), fence_cmp, NULL); - if (count == 1) { - tmp = array[0]; - goto return_tmp; + /* + * Only keep the most recent fence for each context. + */ + j = 0; + for (i = 1; i < count; i++) { + if (array[i]->context == array[j]->context) + dma_fence_put(array[i]); + else + array[++j] = array[i]; } + count = ++j; - result = dma_fence_array_create(count, array, - dma_fence_context_alloc(1), - 1, false); - if (!result) { - for (i = 0; i < count; i++) - dma_fence_put(array[i]); - tmp = NULL; - goto return_tmp; + if (count > 1) { + result = dma_fence_array_create(count, array, + dma_fence_context_alloc(1), + 1, false); + if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); + tmp = NULL; + goto return_tmp; + } + return &result->base; } - return &result->base; + +return_fastpath: + if (count == 0) + tmp = dma_fence_allocate_private_stub(timestamp); + else + tmp = array[0]; return_tmp: kfree(array);

5 months, 3 weeks

1
0
0 0

[PATCH 6.12 186/466] dma-fence: Fix reference leak on fence merge failure path

by Greg Kroah-Hartman

6.12-stable review patch. If anyone has any objections, please let me know. ------------------ From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit 949291c5314009b4f6e252391edbb40fdd5d5414 upstream. Release all fence references if the output dma-fence-array could not be allocated. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-2-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 2 ++ 1 file changed, 2 insertions(+) --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -164,6 +164,8 @@ restart: dma_fence_context_alloc(1), 1, false); if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); tmp = NULL; goto return_tmp; }

5 months, 3 weeks

1
0
0 0

Patch "dma-fence: Use kernel's sort for merging fences" has been added to the 6.12-stable tree

by gregkh＠linuxfoundation.org

This is a note to let you know that I've just added the patch titled dma-fence: Use kernel's sort for merging fences to the 6.12-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… The filename of the patch is: dma-fence-use-kernel-s-sort-for-merging-fences.patch and it can be found in the queue-6.12 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable(a)vger.kernel.org> know about it. From fe52c649438b8489c9456681d93a9b3de3d38263 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Date: Fri, 15 Nov 2024 10:21:50 +0000 Subject: dma-fence: Use kernel's sort for merging fences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit fe52c649438b8489c9456681d93a9b3de3d38263 upstream. One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@am… is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. 
N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. 
(Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 126 +++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 66 deletions(-) --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -12,6 +12,7 @@ #include <linux/dma-fence-chain.h> #include <linux/dma-fence-unwrap.h> #include <linux/slab.h> +#include <linux/sort.h> /* Internal helper to start new array iteration, don't use directly */ static struct dma_fence * @@ -59,6 +60,25 @@ struct dma_fence *dma_fence_unwrap_next( } EXPORT_SYMBOL_GPL(dma_fence_unwrap_next); + +static int fence_cmp(const void *_a, const void *_b) +{ + struct dma_fence *a = *(struct dma_fence **)_a; + struct dma_fence *b = *(struct dma_fence **)_b; + + if (a->context < b->context) + return -1; + else if (a->context > b->context) + return 1; + + if (dma_fence_is_later(b, a)) + return 1; + else if (dma_fence_is_later(a, b)) + return -1; + + return 0; +} + /* Implementation for the dma_fence_merge() marco, don't use directly */ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence **fences, @@ -67,8 +87,7 @@ struct dma_fence *__dma_fence_unwrap_mer 
struct dma_fence_array *result; struct dma_fence *tmp, **array; ktime_t timestamp; - unsigned int i; - size_t count; + int i, j, count; count = 0; timestamp = ns_to_ktime(0); @@ -96,80 +115,55 @@ struct dma_fence *__dma_fence_unwrap_mer if (!array) return NULL; - /* - * This trashes the input fence array and uses it as position for the - * following merge loop. This works because the dma_fence_merge() - * wrapper macro is creating this temporary array on the stack together - * with the iterators. - */ - for (i = 0; i < num_fences; ++i) - fences[i] = dma_fence_unwrap_first(fences[i], &iter[i]); - count = 0; - do { - unsigned int sel; - -restart: - tmp = NULL; - for (i = 0; i < num_fences; ++i) { - struct dma_fence *next; - - while (fences[i] && dma_fence_is_signaled(fences[i])) - fences[i] = dma_fence_unwrap_next(&iter[i]); - - next = fences[i]; - if (!next) - continue; - - /* - * We can't guarantee that inpute fences are ordered by - * context, but it is still quite likely when this - * function is used multiple times. So attempt to order - * the fences by context as we pass over them and merge - * fences with the same context. 
- */ - if (!tmp || tmp->context > next->context) { - tmp = next; - sel = i; - - } else if (tmp->context < next->context) { - continue; - - } else if (dma_fence_is_later(tmp, next)) { - fences[i] = dma_fence_unwrap_next(&iter[i]); - goto restart; + for (i = 0; i < num_fences; ++i) { + dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) { + if (!dma_fence_is_signaled(tmp)) { + array[count++] = dma_fence_get(tmp); } else { - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - goto restart; + ktime_t t = dma_fence_timestamp(tmp); + + if (ktime_after(t, timestamp)) + timestamp = t; } } + } - if (tmp) { - array[count++] = dma_fence_get(tmp); - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - } - } while (tmp); + if (count == 0 || count == 1) + goto return_fastpath; - if (count == 0) { - tmp = dma_fence_allocate_private_stub(ktime_get()); - goto return_tmp; - } + sort(array, count, sizeof(*array), fence_cmp, NULL); - if (count == 1) { - tmp = array[0]; - goto return_tmp; + /* + * Only keep the most recent fence for each context. 
+ */ + j = 0; + for (i = 1; i < count; i++) { + if (array[i]->context == array[j]->context) + dma_fence_put(array[i]); + else + array[++j] = array[i]; } + count = ++j; - result = dma_fence_array_create(count, array, - dma_fence_context_alloc(1), - 1, false); - if (!result) { - for (i = 0; i < count; i++) - dma_fence_put(array[i]); - tmp = NULL; - goto return_tmp; + if (count > 1) { + result = dma_fence_array_create(count, array, + dma_fence_context_alloc(1), + 1, false); + if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); + tmp = NULL; + goto return_tmp; + } + return &result->base; } - return &result->base; + +return_fastpath: + if (count == 0) + tmp = dma_fence_allocate_private_stub(timestamp); + else + tmp = array[0]; return_tmp: kfree(array); Patches currently in stable-queue which might be from tvrtko.ursulin(a)igalia.com are queue-6.12/dma-fence-use-kernel-s-sort-for-merging-fences.patch queue-6.12/dma-buf-fix-dma_fence_array_signaled-v4.patch queue-6.12/dma-fence-fix-reference-leak-on-fence-merge-failure-path.patch

5 months, 4 weeks

1
0
0 0

Patch "dma-fence: Fix reference leak on fence merge failure path" has been added to the 6.12-stable tree

by gregkh＠linuxfoundation.org

This is a note to let you know that I've just added the patch titled dma-fence: Fix reference leak on fence merge failure path to the 6.12-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… The filename of the patch is: dma-fence-fix-reference-leak-on-fence-merge-failure-path.patch and it can be found in the queue-6.12 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable(a)vger.kernel.org> know about it. From 949291c5314009b4f6e252391edbb40fdd5d5414 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Date: Fri, 15 Nov 2024 10:21:49 +0000 Subject: dma-fence: Fix reference leak on fence merge failure path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit 949291c5314009b4f6e252391edbb40fdd5d5414 upstream. Release all fence references if the output dma-fence-array could not be allocated. 
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-2-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 2 ++ 1 file changed, 2 insertions(+) --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -164,6 +164,8 @@ restart: dma_fence_context_alloc(1), 1, false); if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); tmp = NULL; goto return_tmp; } Patches currently in stable-queue which might be from tvrtko.ursulin(a)igalia.com are queue-6.12/dma-fence-use-kernel-s-sort-for-merging-fences.patch queue-6.12/dma-buf-fix-dma_fence_array_signaled-v4.patch queue-6.12/dma-fence-fix-reference-leak-on-fence-merge-failure-path.patch

5 months, 4 weeks

1
0
0 0

Patch "dma-fence: Use kernel's sort for merging fences" has been added to the 6.6-stable tree

by gregkh＠linuxfoundation.org

This is a note to let you know that I've just added the patch titled dma-fence: Use kernel's sort for merging fences to the 6.6-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… The filename of the patch is: dma-fence-use-kernel-s-sort-for-merging-fences.patch and it can be found in the queue-6.6 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable(a)vger.kernel.org> know about it. From fe52c649438b8489c9456681d93a9b3de3d38263 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Date: Fri, 15 Nov 2024 10:21:50 +0000 Subject: dma-fence: Use kernel's sort for merging fences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit fe52c649438b8489c9456681d93a9b3de3d38263 upstream. One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@am… is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. 
N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. 
(Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 126 +++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 66 deletions(-) --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -12,6 +12,7 @@ #include <linux/dma-fence-chain.h> #include <linux/dma-fence-unwrap.h> #include <linux/slab.h> +#include <linux/sort.h> /* Internal helper to start new array iteration, don't use directly */ static struct dma_fence * @@ -59,6 +60,25 @@ struct dma_fence *dma_fence_unwrap_next( } EXPORT_SYMBOL_GPL(dma_fence_unwrap_next); + +static int fence_cmp(const void *_a, const void *_b) +{ + struct dma_fence *a = *(struct dma_fence **)_a; + struct dma_fence *b = *(struct dma_fence **)_b; + + if (a->context < b->context) + return -1; + else if (a->context > b->context) + return 1; + + if (dma_fence_is_later(b, a)) + return 1; + else if (dma_fence_is_later(a, b)) + return -1; + + return 0; +} + /* Implementation for the dma_fence_merge() marco, don't use directly */ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence **fences, @@ -67,8 +87,7 @@ struct dma_fence *__dma_fence_unwrap_mer 
struct dma_fence_array *result; struct dma_fence *tmp, **array; ktime_t timestamp; - unsigned int i; - size_t count; + int i, j, count; count = 0; timestamp = ns_to_ktime(0); @@ -96,80 +115,55 @@ struct dma_fence *__dma_fence_unwrap_mer if (!array) return NULL; - /* - * This trashes the input fence array and uses it as position for the - * following merge loop. This works because the dma_fence_merge() - * wrapper macro is creating this temporary array on the stack together - * with the iterators. - */ - for (i = 0; i < num_fences; ++i) - fences[i] = dma_fence_unwrap_first(fences[i], &iter[i]); - count = 0; - do { - unsigned int sel; - -restart: - tmp = NULL; - for (i = 0; i < num_fences; ++i) { - struct dma_fence *next; - - while (fences[i] && dma_fence_is_signaled(fences[i])) - fences[i] = dma_fence_unwrap_next(&iter[i]); - - next = fences[i]; - if (!next) - continue; - - /* - * We can't guarantee that inpute fences are ordered by - * context, but it is still quite likely when this - * function is used multiple times. So attempt to order - * the fences by context as we pass over them and merge - * fences with the same context. 
- */ - if (!tmp || tmp->context > next->context) { - tmp = next; - sel = i; - - } else if (tmp->context < next->context) { - continue; - - } else if (dma_fence_is_later(tmp, next)) { - fences[i] = dma_fence_unwrap_next(&iter[i]); - goto restart; + for (i = 0; i < num_fences; ++i) { + dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) { + if (!dma_fence_is_signaled(tmp)) { + array[count++] = dma_fence_get(tmp); } else { - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - goto restart; + ktime_t t = dma_fence_timestamp(tmp); + + if (ktime_after(t, timestamp)) + timestamp = t; } } + } - if (tmp) { - array[count++] = dma_fence_get(tmp); - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - } - } while (tmp); + if (count == 0 || count == 1) + goto return_fastpath; - if (count == 0) { - tmp = dma_fence_allocate_private_stub(ktime_get()); - goto return_tmp; - } + sort(array, count, sizeof(*array), fence_cmp, NULL); - if (count == 1) { - tmp = array[0]; - goto return_tmp; + /* + * Only keep the most recent fence for each context. 
+ */ + j = 0; + for (i = 1; i < count; i++) { + if (array[i]->context == array[j]->context) + dma_fence_put(array[i]); + else + array[++j] = array[i]; } + count = ++j; - result = dma_fence_array_create(count, array, - dma_fence_context_alloc(1), - 1, false); - if (!result) { - for (i = 0; i < count; i++) - dma_fence_put(array[i]); - tmp = NULL; - goto return_tmp; + if (count > 1) { + result = dma_fence_array_create(count, array, + dma_fence_context_alloc(1), + 1, false); + if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); + tmp = NULL; + goto return_tmp; + } + return &result->base; } - return &result->base; + +return_fastpath: + if (count == 0) + tmp = dma_fence_allocate_private_stub(timestamp); + else + tmp = array[0]; return_tmp: kfree(array); Patches currently in stable-queue which might be from tvrtko.ursulin(a)igalia.com are queue-6.6/dma-fence-use-kernel-s-sort-for-merging-fences.patch queue-6.6/dma-buf-fix-dma_fence_array_signaled-v4.patch queue-6.6/dma-fence-fix-reference-leak-on-fence-merge-failure-path.patch

5 months, 4 weeks

1
0
0 0

Patch "dma-fence: Fix reference leak on fence merge failure path" has been added to the 6.6-stable tree

by gregkh＠linuxfoundation.org

This is a note to let you know that I've just added the patch titled dma-fence: Fix reference leak on fence merge failure path to the 6.6-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… The filename of the patch is: dma-fence-fix-reference-leak-on-fence-merge-failure-path.patch and it can be found in the queue-6.6 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable(a)vger.kernel.org> know about it. From 949291c5314009b4f6e252391edbb40fdd5d5414 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Date: Fri, 15 Nov 2024 10:21:49 +0000 Subject: dma-fence: Fix reference leak on fence merge failure path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit 949291c5314009b4f6e252391edbb40fdd5d5414 upstream. Release all fence references if the output dma-fence-array could not be allocated. 
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-2-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 2 ++ 1 file changed, 2 insertions(+) --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -164,6 +164,8 @@ restart: dma_fence_context_alloc(1), 1, false); if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); tmp = NULL; goto return_tmp; } Patches currently in stable-queue which might be from tvrtko.ursulin(a)igalia.com are queue-6.6/dma-fence-use-kernel-s-sort-for-merging-fences.patch queue-6.6/dma-buf-fix-dma_fence_array_signaled-v4.patch queue-6.6/dma-fence-fix-reference-leak-on-fence-merge-failure-path.patch

5 months, 4 weeks

1
0
0 0

Patch "dma-fence: Use kernel's sort for merging fences" has been added to the 6.1-stable tree

by gregkh＠linuxfoundation.org

This is a note to let you know that I've just added the patch titled dma-fence: Use kernel's sort for merging fences to the 6.1-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… The filename of the patch is: dma-fence-use-kernel-s-sort-for-merging-fences.patch and it can be found in the queue-6.1 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable(a)vger.kernel.org> know about it. From fe52c649438b8489c9456681d93a9b3de3d38263 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Date: Fri, 15 Nov 2024 10:21:50 +0000 Subject: dma-fence: Use kernel's sort for merging fences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit fe52c649438b8489c9456681d93a9b3de3d38263 upstream. One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@am… is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. 
N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. 
(Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 128 ++++++++++++++--------------- 1 file changed, 61 insertions(+), 67 deletions(-) diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c index b19d0adf6086..6345062731f1 100644 --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -12,6 +12,7 @@ #include <linux/dma-fence-chain.h> #include <linux/dma-fence-unwrap.h> #include <linux/slab.h> +#include <linux/sort.h> /* Internal helper to start new array iteration, don't use directly */ static struct dma_fence * @@ -59,6 +60,25 @@ struct dma_fence *dma_fence_unwrap_next(struct dma_fence_unwrap *cursor) } EXPORT_SYMBOL_GPL(dma_fence_unwrap_next); + +static int fence_cmp(const void *_a, const void *_b) +{ + struct dma_fence *a = *(struct dma_fence **)_a; + struct dma_fence *b = *(struct dma_fence **)_b; + + if (a->context < b->context) + return -1; + else if (a->context > b->context) + return 1; + + if (dma_fence_is_later(b, a)) + return 1; + else if (dma_fence_is_later(a, b)) + return -1; + + return 0; +} + /* Implementation for the dma_fence_merge() marco, don't use directly */ struct 
dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence **fences, @@ -67,8 +87,7 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence_array *result; struct dma_fence *tmp, **array; ktime_t timestamp; - unsigned int i; - size_t count; + int i, j, count; count = 0; timestamp = ns_to_ktime(0); @@ -96,80 +115,55 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, if (!array) return NULL; - /* - * This trashes the input fence array and uses it as position for the - * following merge loop. This works because the dma_fence_merge() - * wrapper macro is creating this temporary array on the stack together - * with the iterators. - */ - for (i = 0; i < num_fences; ++i) - fences[i] = dma_fence_unwrap_first(fences[i], &iter[i]); - count = 0; - do { - unsigned int sel; - -restart: - tmp = NULL; - for (i = 0; i < num_fences; ++i) { - struct dma_fence *next; - - while (fences[i] && dma_fence_is_signaled(fences[i])) - fences[i] = dma_fence_unwrap_next(&iter[i]); - - next = fences[i]; - if (!next) - continue; - - /* - * We can't guarantee that inpute fences are ordered by - * context, but it is still quite likely when this - * function is used multiple times. So attempt to order - * the fences by context as we pass over them and merge - * fences with the same context. 
- */ - if (!tmp || tmp->context > next->context) { - tmp = next; - sel = i; - - } else if (tmp->context < next->context) { - continue; - - } else if (dma_fence_is_later(tmp, next)) { - fences[i] = dma_fence_unwrap_next(&iter[i]); - goto restart; + for (i = 0; i < num_fences; ++i) { + dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) { + if (!dma_fence_is_signaled(tmp)) { + array[count++] = dma_fence_get(tmp); } else { - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - goto restart; + ktime_t t = dma_fence_timestamp(tmp); + + if (ktime_after(t, timestamp)) + timestamp = t; } } - - if (tmp) { - array[count++] = dma_fence_get(tmp); - fences[sel] = dma_fence_unwrap_next(&iter[sel]); - } - } while (tmp); - - if (count == 0) { - tmp = dma_fence_allocate_private_stub(ktime_get()); - goto return_tmp; } - if (count == 1) { - tmp = array[0]; - goto return_tmp; - } + if (count == 0 || count == 1) + goto return_fastpath; - result = dma_fence_array_create(count, array, - dma_fence_context_alloc(1), - 1, false); - if (!result) { - for (i = 0; i < count; i++) + sort(array, count, sizeof(*array), fence_cmp, NULL); + + /* + * Only keep the most recent fence for each context. 
+ */ + j = 0; + for (i = 1; i < count; i++) { + if (array[i]->context == array[j]->context) dma_fence_put(array[i]); - tmp = NULL; - goto return_tmp; + else + array[++j] = array[i]; } - return &result->base; + count = ++j; + + if (count > 1) { + result = dma_fence_array_create(count, array, + dma_fence_context_alloc(1), + 1, false); + if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); + tmp = NULL; + goto return_tmp; + } + return &result->base; + } + +return_fastpath: + if (count == 0) + tmp = dma_fence_allocate_private_stub(timestamp); + else + tmp = array[0]; return_tmp: kfree(array); -- 2.47.1 Patches currently in stable-queue which might be from tvrtko.ursulin(a)igalia.com are queue-6.1/dma-fence-use-kernel-s-sort-for-merging-fences.patch queue-6.1/dma-buf-fix-dma_fence_array_signaled-v4.patch queue-6.1/dma-fence-fix-reference-leak-on-fence-merge-failure-path.patch

5 months, 4 weeks

1
0
0 0

Patch "dma-fence: Fix reference leak on fence merge failure path" has been added to the 6.1-stable tree

by gregkh＠linuxfoundation.org

This is a note to let you know that I've just added the patch titled dma-fence: Fix reference leak on fence merge failure path to the 6.1-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… The filename of the patch is: dma-fence-fix-reference-leak-on-fence-merge-failure-path.patch and it can be found in the queue-6.1 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable(a)vger.kernel.org> know about it. From 949291c5314009b4f6e252391edbb40fdd5d5414 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Date: Fri, 15 Nov 2024 10:21:49 +0000 Subject: dma-fence: Fix reference leak on fence merge failure path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> commit 949291c5314009b4f6e252391edbb40fdd5d5414 upstream. Release all fence references if the output dma-fence-array could not be allocated. 
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Cc: Christian König <christian.koenig(a)amd.com> Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch> Cc: Sumit Semwal <sumit.semwal(a)linaro.org> Cc: Gustavo Padovan <gustavo(a)padovan.org> Cc: Friedrich Vock <friedrich.vock(a)gmx.de> Cc: linux-media(a)vger.kernel.org Cc: dri-devel(a)lists.freedesktop.org Cc: linaro-mm-sig(a)lists.linaro.org Cc: <stable(a)vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig(a)amd.com> Signed-off-by: Christian König <christian.koenig(a)amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-2-tursuli… Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/dma-buf/dma-fence-unwrap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c index 628af51c81af..b19d0adf6086 100644 --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -164,6 +164,8 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, dma_fence_context_alloc(1), 1, false); if (!result) { + for (i = 0; i < count; i++) + dma_fence_put(array[i]); tmp = NULL; goto return_tmp; } -- 2.47.1 Patches currently in stable-queue which might be from tvrtko.ursulin(a)igalia.com are queue-6.1/dma-fence-use-kernel-s-sort-for-merging-fences.patch queue-6.1/dma-buf-fix-dma_fence_array_signaled-v4.patch queue-6.1/dma-fence-fix-reference-leak-on-fence-merge-failure-path.patch

5 months, 4 weeks

1
0
0 0

[RFC 00/24] drm/msm: sparse / "VM_BIND" support

by Rob Clark

From: Rob Clark <robdclark(a)chromium.org> Conversion to DRM GPU VA Manager[1], and adding support for Vulkan Sparse Memory[2] in the form of: 1. A new VM_BIND submitqueue type for executing VM MSM_SUBMIT_BO_OP_MAP/MAP_NULL/UNMAP commands 2. Extending the `SUBMIT` ioctl to allow submitting batches of one or more MAP/MAP_NULL/UNMAP commands to a VM_BIND submitqueue The UABI takes a slightly different approach from what other drivers have done, and what would make sense if starting from a clean sheet, i.e. separate VM_BIND and EXEC ioctls. But since we have to maintain support for the existing SUBMIT ioctl, and because the fence, syncobj, and BO pinning is largely the same between legacy "BO-table" style SUBMIT ioctls, and new-style VM updates submitted to a VM_BIND submitqueue, I chose to go the route of extending the existing `SUBMIT` ioctl rather than adding a new ioctl. I also did not implement support for synchronous VM_BIND commands. Since userspace could just immediately wait for the `SUBMIT` to complete, I don't think we need this extra complexity in the kernel. The corresponding mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32533 ### Notes/TODOs/Open Questions: 1. The first handful of patches are from Bibek Kumar Patro's series, "iommu/arm-smmu: introduction of ACTLR implementation for Qualcomm SoCs"[3], which introduces PRR (Partially-Resident-Region) support, needed to implement MAP_NULL (for Vulkan Sparse Residency[4]). 2. Why do VM_BIND commands need fence fd support, instead of just syncobjs? Mainly for the benefit of virtgpu drm native context guest<->host fence passing[5], where the host VMM is operating in terms of fence fd's (syncobjs are just a convenience wrapper above a dma_fence, and don't exist below the guest kernel). 3. Currently shrinker support is disabled (hence this being in Draft/RFC state). 
To properly support the shrinker, we need to pre-allocate various objects and pages needed for the pagetables themselves, to move memory allocations out of the fence signaling path. This shortcut was taken to unblock userspace implementation of sparse buffer/image support. 4. Could/should we do all the vm/vma updates synchronously and defer _only_ the io-pgtable updates to the VM_BIND scheduler queue? This would simplify the previous point, in that we'd only have to pre-allocate pages for the io-pgtable updates. 5. Currently we lose support for BO dumping for devcoredump. Ideally we'd plumb the `MSM_SUBMIT_BO_DUMP` flag in `MAP` commands through to the resulting drm_gpuva's. To do this, I think we need to extend drm_gpuva with a flags field. The flags can be driver defined, but drm_gpuvm needs to know not to merge drm_gpuva's with different flags. This series can be found in MR form, if you prefer: https://gitlab.freedesktop.org/drm/msm/-/merge_requests/144 [1] https://www.kernel.org/doc/html/next/gpu/drm-mm.html#drm-gpuvm [2] https://docs.vulkan.org/spec/latest/chapters/sparsemem.html [3] https://patchwork.kernel.org/project/linux-arm-kernel/list/?series=909700 [4] https://docs.vulkan.org/spec/latest/chapters/sparsemem.html#sparsememory-pa… [5] https://patchew.org/linux/20231007194747.788934-1-dmitry.osipenko@collabora… Rob Clark (24): HACK: drm/msm: Disable shrinker drm/gpuvm: Don't require obj lock in destructor path drm/gpuvm: Remove bogus lock assert drm/msm: Rename msm_file_private -> msm_context drm/msm: Improve msm_context comments drm/msm: Rename msm_gem_address_space -> msm_gem_vm drm/msm: Remove vram carveout support drm/msm: Collapse vma allocation and initialization drm/msm: Collapse vma close and delete drm/msm: drm_gpuvm conversion drm/msm: Use drm_gpuvm types more drm/msm: Split submit_pin_objects() drm/msm: Lazily create context VM drm/msm: Add opt-in for VM_BIND drm/msm: Mark VM as unusable on faults drm/msm: Extend SUBMIT ioctl for VM_BIND 
drm/msm: Add VM_BIND submitqueue drm/msm: Add _NO_SHARE flag drm/msm: Split out helper to get iommu prot flags drm/msm: Add mmu support for non-zero offset drm/msm: Add PRR support drm/msm: Rename msm_gem_vma_purge() -> _unmap() drm/msm: Wire up gpuvm ops drm/msm: Bump UAPI version drivers/gpu/drm/drm_gpuvm.c | 10 +- drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/adreno/a2xx_gpu.c | 19 +- drivers/gpu/drm/msm/adreno/a2xx_gpummu.c | 5 +- drivers/gpu/drm/msm/adreno/a3xx_gpu.c | 4 +- drivers/gpu/drm/msm/adreno/a4xx_gpu.c | 4 +- drivers/gpu/drm/msm/adreno/a5xx_debugfs.c | 4 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 24 +- drivers/gpu/drm/msm/adreno/a5xx_power.c | 2 +- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 10 +- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 32 +- drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 2 +- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 51 +- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 6 +- drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 10 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 78 ++- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 22 +- .../drm/msm/disp/dpu1/dpu_encoder_phys_wb.c | 14 +- drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c | 18 +- drivers/gpu/drm/msm/disp/dpu1/dpu_formats.h | 2 +- drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c | 18 +- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 14 +- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h | 4 +- drivers/gpu/drm/msm/disp/mdp4/mdp4_crtc.c | 6 +- drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c | 28 +- drivers/gpu/drm/msm/disp/mdp4/mdp4_plane.c | 12 +- drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c | 4 +- drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c | 19 +- drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c | 12 +- drivers/gpu/drm/msm/dsi/dsi_host.c | 14 +- drivers/gpu/drm/msm/msm_drv.c | 175 ++---- drivers/gpu/drm/msm/msm_drv.h | 31 +- drivers/gpu/drm/msm/msm_fb.c | 18 +- drivers/gpu/drm/msm/msm_fbdev.c | 2 +- drivers/gpu/drm/msm/msm_gem.c | 403 ++++++------- drivers/gpu/drm/msm/msm_gem.h | 193 +++++-- drivers/gpu/drm/msm/msm_gem_prime.c | 15 
+ drivers/gpu/drm/msm/msm_gem_submit.c | 223 +++++-- drivers/gpu/drm/msm/msm_gem_vma.c | 543 +++++++++++++++--- drivers/gpu/drm/msm/msm_gpu.c | 66 ++- drivers/gpu/drm/msm/msm_gpu.h | 132 +++-- drivers/gpu/drm/msm/msm_iommu.c | 84 ++- drivers/gpu/drm/msm/msm_kms.c | 14 +- drivers/gpu/drm/msm/msm_kms.h | 2 +- drivers/gpu/drm/msm/msm_mmu.h | 2 +- drivers/gpu/drm/msm/msm_ringbuffer.c | 4 +- drivers/gpu/drm/msm/msm_submitqueue.c | 86 ++- include/uapi/drm/msm_drm.h | 98 +++- 48 files changed, 1637 insertions(+), 903 deletions(-) -- 2.47.1

6 months

1
3
0 0

[PATCH v3 0/4] TEE subsystem for restricted dma-buf allocations

by Jens Wiklander

Hi, This patch set allocates the restricted DMA-bufs via the TEE subsystem. This is a big update compared to the previous patch set [1]. The TEE subsystem handles the DMA-buf allocations since it is the TEE (OP-TEE, AMD-TEE, TS-TEE, or perhaps a future QTEE) which sets up the restrictions for the memory used for the DMA-bufs. I've added a new IOCTL, TEE_IOC_RSTMEM_ALLOC, to allocate the restricted DMA-bufs. This IOCTL reaches the backend TEE driver, allowing it to choose how to allocate the restricted physical memory. TEE_IOC_RSTMEM_ALLOC takes, in addition to size and flags parameters, a use-case parameter. This is used by the backend TEE driver to decide on allocation policy and which devices should be able to access the memory. Three use-cases (Secure Video Playback, Trusted UI, and Secure Video Recording) have been identified so far to serve as examples of what can be expected. More use-cases can be added in userspace ABI, but it's up to the backend TEE drivers to provide the implementation. Each use-case has its own restricted memory pool since different use-cases require isolation from different parts of the system. A restricted memory pool can be based on a static carveout instantiated while probing the TEE backend driver, or dynamically allocated from CMA and made restricted as needed by the TEE. This can be tested on QEMU with the following steps: repo init -u https://github.com/jenswi-linaro/manifest.git -m qemu_v8.xml \ -b prototype/sdp-v3 repo sync -j8 cd build make toolchains -j$(nproc) make SPMC_AT_EL=1 all -j$(nproc) make SPMC_AT_EL=1 run-only # login and at the prompt: xtest --sdp-basic The SPMC_AT_EL=1 parameter configures the build with FF-A and an SPMC at S-EL1 inside OP-TEE. The parameter can be changed into SPMC_AT_EL=n to test without FF-A using the original SMC ABI instead. Please remember to do %rm -rf ../trusted-firmware-a/build/qemu for TF-A to be rebuilt properly using the new configuration. 
https://optee.readthedocs.io/en/latest/building/prerequisites.html list dependencies needed to build the above. The tests are pretty basic, mostly checking that a Trusted Application in the secure world can access and manipulate the memory. There are also some negative tests for out of bounds buffers etc. Thanks, Jens [1] https://lore.kernel.org/lkml/20241015101716.740829-1-jens.wiklander@linaro.… Changes since the V2 RFC: * Based on v6.12 * Replaced the flags for SVP and Trusted UID memory with a u32 field with unique id for each use case * Added dynamic allocation of restricted memory pools * Added OP-TEE ABI both with and without FF-A for dynamic restricted memory * Added support for FF-A with FFA_LEND Changes since the V1 RFC: * Based on v6.11 * Complete rewrite, replacing the restricted heap with TEE_IOC_RSTMEM_ALLOC Changes since Olivier's post [2]: * Based on Yong Wu's post [1] where much of dma-buf handling is done in the generic restricted heap * Simplifications and cleanup * New commit message for "dma-buf: heaps: add Linaro restricted dmabuf heap support" * Replaced the word "secure" with "restricted" where applicable Jens Wiklander (4): tee: add restricted memory allocation optee: account for direction while converting parameters optee: sync secure world ABI headers optee: support restricted memory allocation drivers/tee/Makefile | 1 + drivers/tee/optee/Makefile | 1 + drivers/tee/optee/call.c | 10 +- drivers/tee/optee/core.c | 1 + drivers/tee/optee/ffa_abi.c | 178 +++++++++++++- drivers/tee/optee/optee_ffa.h | 27 ++- drivers/tee/optee/optee_msg.h | 65 ++++- drivers/tee/optee/optee_private.h | 75 ++++-- drivers/tee/optee/optee_smc.h | 71 +++++- drivers/tee/optee/rpc.c | 31 ++- drivers/tee/optee/rstmem.c | 380 ++++++++++++++++++++++++++++++ drivers/tee/optee/smc_abi.c | 214 +++++++++++++++-- drivers/tee/tee_core.c | 37 ++- drivers/tee/tee_private.h | 2 + drivers/tee/tee_rstmem.c | 201 ++++++++++++++++ drivers/tee/tee_shm.c | 2 + drivers/tee/tee_shm_pool.c 
| 69 +++++- include/linux/tee_core.h | 15 ++ include/linux/tee_drv.h | 4 +- include/uapi/linux/tee.h | 37 ++- 20 files changed, 1344 insertions(+), 77 deletions(-) create mode 100644 drivers/tee/optee/rstmem.c create mode 100644 drivers/tee/tee_rstmem.c -- 2.43.0

6 months

2
12
0 0

Re: [PATCH v3 1/3] dmaengine: qcom: gpi: Add GPI Block event interrupt support

by Vinod Koul

On 21-11-24, 18:31, Jyothi Kumar Seerapu wrote: > GSI hardware generates an interrupt for each transfer completion. > For multiple messages within a single transfer, this results in > N interrupts for N messages, leading to significant software > interrupt latency. > > To mitigate this latency, utilize Block Event Interrupt (BEI) mechanism. > Enabling BEI instructs the GSI hardware to prevent interrupt generation > and BEI is disabled when an interrupt is necessary. > > When using BEI, consider splitting a single multi-message transfer into > chunks of 8 internally. Interrupts are not expected for the first 7 message > completions, only the last message triggers an interrupt,indicating > the completion of 8 messages. > > This BEI mechanism enhances overall transfer efficiency. > > Signed-off-by: Jyothi Kumar Seerapu <quic_jseerapu(a)quicinc.com> > --- > > v2-> v3: > - Renamed gpi_multi_desc_process to gpi_multi_xfer_timeout_handler > - MIN_NUM_OF_MSGS_MULTI_DESC changed from 4 to 2 > - Added documentation for newly added changes in "qcom-gpi-dma.h" file > - Updated commit description. > > v1 -> v2: > - Changed dma_addr type from array of pointers to array. > - To support BEI functionality with the TRE size of 64 defined in GPI driver, > updated QCOM_GPI_MAX_NUM_MSGS to 16 and NUM_MSGS_PER_IRQ to 4. 
> > drivers/dma/qcom/gpi.c | 48 ++++++++++++++++++++ > include/linux/dma/qcom-gpi-dma.h | 76 ++++++++++++++++++++++++++++++++ > 2 files changed, 124 insertions(+) > > diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c > index 52a7c8f2498f..5442b65b1638 100644 > --- a/drivers/dma/qcom/gpi.c > +++ b/drivers/dma/qcom/gpi.c > @@ -1693,6 +1693,9 @@ static int gpi_create_i2c_tre(struct gchan *chan, struct gpi_desc *desc, > > tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE); > tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT); > + > + if (i2c->flags & QCOM_GPI_BLOCK_EVENT_IRQ) > + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_BEI); > } > > for (i = 0; i < tre_idx; i++) > @@ -2098,6 +2101,51 @@ static int gpi_find_avail_gpii(struct gpi_dev *gpi_dev, u32 seid) > return -EIO; > } > > +/** > + * gpi_multi_xfer_timeout_handler() - Handle multi message transfer timeout > + * @dev: pointer to the corresponding dev node > + * @multi_xfer: pointer to the gpi_multi_xfer > + * @num_xfers: total number of transfers > + * @transfer_timeout_msecs: transfer timeout value > + * @transfer_comp: completion object of the transfer > + * > + * This function is used to wait for the processed transfers based on > + * the interrupts generated upon transfer completion. > + * Return: On success returns 0, otherwise return error code (-ETIMEDOUT) > + */ > +int gpi_multi_xfer_timeout_handler(struct device *dev, struct gpi_multi_xfer *multi_xfer, > + u32 num_xfers, u32 transfer_timeout_msecs, > + struct completion *transfer_comp) > +{ > + int i; > + u32 max_irq_cnt, time_left; > + > + max_irq_cnt = num_xfers / NUM_MSGS_PER_IRQ; > + if (num_xfers % NUM_MSGS_PER_IRQ) > + max_irq_cnt++; > + > + /* > + * Wait for the interrupts of the processed transfers in multiple > + * of 8 and for the last transfer. If the hardware is fast and > + * already processed all the transfers then no need to wait. 
> + */ > + for (i = 0; i < max_irq_cnt; i++) { > + reinit_completion(transfer_comp); > + if (max_irq_cnt != multi_xfer->irq_cnt) { > + time_left = wait_for_completion_timeout(transfer_comp, > + transfer_timeout_msecs); > + if (!time_left) { > + dev_err(dev, "%s: Transfer timeout\n", __func__); > + return -ETIMEDOUT; > + } > + } > + if (num_xfers > multi_xfer->msg_idx_cnt) > + return 0; > + } > + return 0; > +} > +EXPORT_SYMBOL_GPL(gpi_multi_xfer_timeout_handler); > + > /* gpi_of_dma_xlate: open client requested channel */ > static struct dma_chan *gpi_of_dma_xlate(struct of_phandle_args *args, > struct of_dma *of_dma) > diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h > index 6680dd1a43c6..f001a8ac1887 100644 > --- a/include/linux/dma/qcom-gpi-dma.h > +++ b/include/linux/dma/qcom-gpi-dma.h > @@ -15,6 +15,38 @@ enum spi_transfer_cmd { > SPI_DUPLEX, > }; > > +/** > + * define QCOM_GPI_BLOCK_EVENT_IRQ - Block event interrupt support > + * > + * This is used to enable/disable the Block event interrupt mechanism. > + */ > +#define QCOM_GPI_BLOCK_EVENT_IRQ BIT(0) > + > +/** > + * define QCOM_GPI_MAX_NUM_MSGS - maximum number of messages support > + * > + * This indicates maximum number of messages can allocate and > + * submit to hardware. To handle more messages beyond this, > + * need to unmap the processed messages. > + */ > +#define QCOM_GPI_MAX_NUM_MSGS 16 > + > +/** > + * define NUM_MSGS_PER_IRQ - interrupt per messages completion > + * > + * This indicates that trigger an interrupt, after the completion of 8 messages. > + */ > +#define NUM_MSGS_PER_IRQ 8 > + > +/** > + * define MIN_NUM_OF_MSGS_MULTI_DESC - \ > + * minimum number of messages to support Block evenet interrupt > + * > + * This indicates minimum number of messages in a trenafer required to > + * process it using block event interrupt mechanism. 
> + */ > +#define MIN_NUM_OF_MSGS_MULTI_DESC 2 > + > /** > * struct gpi_spi_config - spi config for peripheral > * > @@ -51,6 +83,29 @@ enum i2c_op { > I2C_READ, > }; why should these be exposed to user? > > +/** > + * struct gpi_multi_xfer - Used for multi transfer support > + * > + * @msg_idx_cnt: message index for the transfer > + * @buf_idx: dma buffer index > + * @unmap_msg_cnt: unmapped transfer index > + * @freed_msg_cnt: freed transfer index > + * @irq_cnt: received interrupt count > + * @irq_msg_cnt: transfer message count for the received irqs > + * @dma_buf: virtual addresses of the buffers > + * @dma_addr: dma addresses of the buffers > + */ > +struct gpi_multi_xfer { > + u32 msg_idx_cnt; > + u32 buf_idx; > + u32 unmap_msg_cnt; > + u32 freed_msg_cnt; > + u32 irq_cnt; > + u32 irq_msg_cnt; > + void *dma_buf[QCOM_GPI_MAX_NUM_MSGS]; > + dma_addr_t dma_addr[QCOM_GPI_MAX_NUM_MSGS]; > +}; DMAengine API can do multiple transfers and we already have flags for interrupts, pls use that instead of usual behaviour of defining custom interfaces to handle everything. 
That is not recommended > + > /** > * struct gpi_i2c_config - i2c config for peripheral > * > @@ -65,6 +120,8 @@ enum i2c_op { > * @rx_len: receive length for buffer > * @op: i2c cmd > * @muli-msg: is part of multi i2c r-w msgs > + * @flags: true for block event interrupt support > + * @multi_xfer: indicates transfer has multi messages > */ > struct gpi_i2c_config { > u8 set_config; > @@ -78,6 +135,25 @@ struct gpi_i2c_config { > u32 rx_len; > enum i2c_op op; > bool multi_msg; > + u8 flags; > + struct gpi_multi_xfer multi_xfer; > }; > > +/** > + * gpi_multi_timeout_handler() - Handle multi message transfer timeout > + * @dev: pointer to the corresponding dev node > + * @multi_xfer: pointer to the gpi_multi_xfer > + * @num_xfers: total number of transfers > + * @transfer_timeout_msecs: transfer timeout value > + * @transfer_comp: completion object of the transfer > + * > + * This function is used to wait for the processed transfers based on > + * the interrupts generated upon transfer completion. > + * > + * Return: On success returns 0, otherwise return error code (-ETIMEDOUT) > + */ > +int gpi_multi_xfer_timeout_handler(struct device *dev, struct gpi_multi_xfer *multi_xfer, > + u32 num_xfers, u32 tranfer_timeout_msecs, > + struct completion *transfer_comp); Why should a handler be here? -- ~Vinod

6 months, 1 week

1
0
0 0

[PATCH AUTOSEL 6.11 08/87] udmabuf: change folios array from kmalloc to kvmalloc

by Sasha Levin

From: Huan Yang <link(a)vivo.com> [ Upstream commit 1c0844c6184e658064e14c4335885785ad3bf84b ] When PAGE_SIZE 4096, MAX_PAGE_ORDER 10, 64bit machine, page_alloc only support 4MB. If above this, trigger this warn and return NULL. udmabuf can change size limit, if change it to 3072(3GB), and then alloc 3GB udmabuf, will fail create. [ 4080.876581] ------------[ cut here ]------------ [ 4080.876843] WARNING: CPU: 3 PID: 2015 at mm/page_alloc.c:4556 __alloc_pages+0x2c8/0x350 [ 4080.878839] RIP: 0010:__alloc_pages+0x2c8/0x350 [ 4080.879470] Call Trace: [ 4080.879473] <TASK> [ 4080.879473] ? __alloc_pages+0x2c8/0x350 [ 4080.879475] ? __warn.cold+0x8e/0xe8 [ 4080.880647] ? __alloc_pages+0x2c8/0x350 [ 4080.880909] ? report_bug+0xff/0x140 [ 4080.881175] ? handle_bug+0x3c/0x80 [ 4080.881556] ? exc_invalid_op+0x17/0x70 [ 4080.881559] ? asm_exc_invalid_op+0x1a/0x20 [ 4080.882077] ? udmabuf_create+0x131/0x400 Because MAX_PAGE_ORDER, kmalloc can max alloc 4096 * (1 << 10), 4MB memory, each array entry is pointer(8byte), so can save 524288 pages(2GB). Further more, costly order(order 3) may not be guaranteed that it can be applied for, due to fragmentation. This patch change udmabuf array use kvmalloc_array, this can fallback alloc into vmalloc, which can guarantee allocation for any size and does not affect the performance of kmalloc allocations. 
Signed-off-by: Huan Yang <link(a)vivo.com> Acked-by: Christian König <christian.koenig(a)amd.com> Acked-by: Vivek Kasireddy <vivek.kasireddy(a)intel.com> Signed-off-by: Vivek Kasireddy <vivek.kasireddy(a)intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20240918025238.2957823-3-link… Signed-off-by: Sasha Levin <sashal(a)kernel.org> --- drivers/dma-buf/udmabuf.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 047c3cd2cefff..bc94c194e172d 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -80,7 +80,7 @@ static int vmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) dma_resv_assert_held(buf->resv); - pages = kmalloc_array(ubuf->pagecount, sizeof(*pages), GFP_KERNEL); + pages = kvmalloc_array(ubuf->pagecount, sizeof(*pages), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -88,7 +88,7 @@ static int vmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) pages[pg] = &ubuf->folios[pg]->page; vaddr = vm_map_ram(pages, ubuf->pagecount, -1); - kfree(pages); + kvfree(pages); if (!vaddr) return -EINVAL; @@ -196,8 +196,8 @@ static void release_udmabuf(struct dma_buf *buf) put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL); unpin_all_folios(&ubuf->unpin_list); - kfree(ubuf->offsets); - kfree(ubuf->folios); + kvfree(ubuf->offsets); + kvfree(ubuf->folios); kfree(ubuf); } @@ -322,14 +322,14 @@ static long udmabuf_create(struct miscdevice *device, if (!ubuf->pagecount) goto err; - ubuf->folios = kmalloc_array(ubuf->pagecount, sizeof(*ubuf->folios), - GFP_KERNEL); + ubuf->folios = kvmalloc_array(ubuf->pagecount, sizeof(*ubuf->folios), + GFP_KERNEL); if (!ubuf->folios) { ret = -ENOMEM; goto err; } - ubuf->offsets = kcalloc(ubuf->pagecount, sizeof(*ubuf->offsets), - GFP_KERNEL); + ubuf->offsets = kvcalloc(ubuf->pagecount, sizeof(*ubuf->offsets), + GFP_KERNEL); if (!ubuf->offsets) { ret = -ENOMEM; goto err; @@ -343,7 +343,7 @@ static long 
udmabuf_create(struct miscdevice *device, goto err; pgcnt = list[i].size >> PAGE_SHIFT; - folios = kmalloc_array(pgcnt, sizeof(*folios), GFP_KERNEL); + folios = kvmalloc_array(pgcnt, sizeof(*folios), GFP_KERNEL); if (!folios) { ret = -ENOMEM; goto err; @@ -353,7 +353,7 @@ static long udmabuf_create(struct miscdevice *device, ret = memfd_pin_folios(memfd, list[i].offset, end, folios, pgcnt, &pgoff); if (ret <= 0) { - kfree(folios); + kvfree(folios); if (!ret) ret = -EINVAL; goto err; @@ -382,7 +382,7 @@ static long udmabuf_create(struct miscdevice *device, } } - kfree(folios); + kvfree(folios); fput(memfd); memfd = NULL; } @@ -398,8 +398,8 @@ static long udmabuf_create(struct miscdevice *device, if (memfd) fput(memfd); unpin_all_folios(&ubuf->unpin_list); - kfree(ubuf->offsets); - kfree(ubuf->folios); + kvfree(ubuf->offsets); + kvfree(ubuf->folios); kfree(ubuf); return ret; } -- 2.43.0

6 months, 2 weeks

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

Linaro-mm-sig