On 1/14/26 2:46 AM, Tomeu Vizoso wrote:
> This memory region is used by the DRM/accel driver to allocate addresses
> for buffers that are used for communication with the DSP cores and for
> their intermediate results.
>
> Signed-off-by: Tomeu Vizoso <tomeu(a)tomeuvizoso.net>
> ---
> arch/arm64/boot/dts/ti/k3-j722s-ti-ipc-firmware.dtsi | 11 +++++++++--
> 1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/boot/dts/ti/k3-j722s-ti-ipc-firmware.dtsi b/arch/arm64/boot/dts/ti/k3-j722s-ti-ipc-firmware.dtsi
> index 3fbff927c4c08bce741555aa2753a394b751144f..b80d2a5a157ad59eaed8e57b22f1f4bce4765a85 100644
> --- a/arch/arm64/boot/dts/ti/k3-j722s-ti-ipc-firmware.dtsi
> +++ b/arch/arm64/boot/dts/ti/k3-j722s-ti-ipc-firmware.dtsi
> @@ -42,6 +42,11 @@ c7x_0_memory_region: memory@a3100000 {
> no-map;
> };
>
> + c7x_iova_pool: iommu-pool@a7000000 {
> + reg = <0x00 0xa7000000 0x00 0x18200000>;
> + no-map;
Could you expand on why this carveout is needed? The C7 NPU has a full
MMU and should be able to work with any buffer Linux allocates from any
address, even non-contiguous ones.
Communication should already happen over the existing RPMSG channels
without needing extra buffers. And space for intermediate results
should be provided dynamically by the drivers (I believe that would
match how GPUs without dedicated memory get their intermediate buffer
space from system memory these days, but do correct me if I'm wrong
about that one).
Andrew
> + };
> +
> c7x_1_dma_memory_region: memory@a4000000 {
> compatible = "shared-dma-pool";
> reg = <0x00 0xa4000000 0x00 0x100000>;
> @@ -151,13 +156,15 @@ &main_r5fss0_core0 {
> &c7x_0 {
> mboxes = <&mailbox0_cluster2 &mbox_c7x_0>;
> memory-region = <&c7x_0_dma_memory_region>,
> - <&c7x_0_memory_region>;
> + <&c7x_0_memory_region>,
> + <&c7x_iova_pool>;
> status = "okay";
> };
>
> &c7x_1 {
> mboxes = <&mailbox0_cluster3 &mbox_c7x_1>;
> memory-region = <&c7x_1_dma_memory_region>,
> - <&c7x_1_memory_region>;
> + <&c7x_1_memory_region>,
> + <&c7x_iova_pool>;
> status = "okay";
> };
>
On Tue, 13 Jan 2026, Tomeu Vizoso <tomeu(a)tomeuvizoso.net> wrote:
> +#include "linux/dev_printk.h"
Random drive-by comment: please use <> instead of "" for headers under
include/.
> +#include <drm/drm_file.h>
> +#include <drm/drm_gem.h>
> +#include <drm/drm_print.h>
> +#include <drm/thames_accel.h>
> +#include <linux/platform_device.h>
In general, I think it will make everyone's life easier in the long run
if the include directives are grouped and sorted.
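For this file that would be something like the following (a sketch;
exact grouping conventions vary, but typically linux/ first, then drm/,
sorted alphabetically within each group):

	#include <linux/dev_printk.h>
	#include <linux/platform_device.h>

	#include <drm/drm_file.h>
	#include <drm/drm_gem.h>
	#include <drm/drm_print.h>
	#include <drm/thames_accel.h>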
BR,
Jani.
--
Jani Nikula, Intel
On 1/14/26 10:53, Tvrtko Ursulin wrote:
> On 13/01/2026 15:16, Christian König wrote:
>> Some drivers use fence->ops to test if a fence was initialized or not.
>> The problem is that this utilizes internal behavior of the dma_fence
>> implementation.
>>
>> So better abstract that into a function.
>>
>> Signed-off-by: Christian König <christian.koenig(a)amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 13 +++++++------
>> drivers/gpu/drm/qxl/qxl_release.c | 2 +-
>> include/linux/dma-fence.h | 12 ++++++++++++
>> 3 files changed, 20 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 0a0dcbf0798d..b97f90bbe8b9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -278,9 +278,10 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
>> unsigned i;
>> /* Check if any fences were initialized */
>> - if (job->base.s_fence && job->base.s_fence->finished.ops)
>> + if (job->base.s_fence &&
>> + dma_fence_is_initialized(&job->base.s_fence->finished))
>> f = &job->base.s_fence->finished;
>> - else if (job->hw_fence && job->hw_fence->base.ops)
>> + else if (dma_fence_is_initialized(&job->hw_fence->base))
>> f = &job->hw_fence->base;
>> else
>> f = NULL;
>> @@ -297,11 +298,11 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>> amdgpu_sync_free(&job->explicit_sync);
>> - if (job->hw_fence->base.ops)
>> + if (dma_fence_is_initialized(&job->hw_fence->base))
>> dma_fence_put(&job->hw_fence->base);
>> else
>> kfree(job->hw_fence);
>> - if (job->hw_vm_fence->base.ops)
>> + if (dma_fence_is_initialized(&job->hw_vm_fence->base))
>> dma_fence_put(&job->hw_vm_fence->base);
>> else
>> kfree(job->hw_vm_fence);
>> @@ -335,11 +336,11 @@ void amdgpu_job_free(struct amdgpu_job *job)
>> if (job->gang_submit != &job->base.s_fence->scheduled)
>> dma_fence_put(job->gang_submit);
>> - if (job->hw_fence->base.ops)
>> + if (dma_fence_is_initialized(&job->hw_fence->base))
>> dma_fence_put(&job->hw_fence->base);
>> else
>> kfree(job->hw_fence);
>> - if (job->hw_vm_fence->base.ops)
>> + if (dma_fence_is_initialized(&job->hw_vm_fence->base))
>> dma_fence_put(&job->hw_vm_fence->base);
>> else
>> kfree(job->hw_vm_fence);
>> diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c
>> index 7b3c9a6016db..b38ae0b25f3c 100644
>> --- a/drivers/gpu/drm/qxl/qxl_release.c
>> +++ b/drivers/gpu/drm/qxl/qxl_release.c
>> @@ -146,7 +146,7 @@ qxl_release_free(struct qxl_device *qdev,
>> idr_remove(&qdev->release_idr, release->id);
>> spin_unlock(&qdev->release_idr_lock);
>> - if (release->base.ops) {
>> + if (dma_fence_is_initialized(&release->base)) {
>> WARN_ON(list_empty(&release->bos));
>> qxl_release_free_list(release);
>> diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
>> index eea674acdfa6..371aa8ecf18e 100644
>> --- a/include/linux/dma-fence.h
>> +++ b/include/linux/dma-fence.h
>> @@ -274,6 +274,18 @@ void dma_fence_release(struct kref *kref);
>> void dma_fence_free(struct dma_fence *fence);
>> void dma_fence_describe(struct dma_fence *fence, struct seq_file *seq);
>> +/**
>> + * dma_fence_is_initialized - test if fence was initialized
>> + * @fence: fence to test
>> + *
>> + * Return: True if fence was initialized, false otherwise. Works correctly only
>> + * when memory backing the fence structure is zero initialized on allocation.
>> + */
>> +static inline bool dma_fence_is_initialized(struct dma_fence *fence)
>> +{
>> + return fence && !!fence->ops;
>
> This patch should precede the one adding RCU protection to fence->ops. And that one then needs to add a rcu_dereference() here.
Good point.
> At which point, however, would it start exploding?
When we start setting the ops pointer to NULL in the next patch.
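With that applied, the check here would then need to become something
like this (just a sketch, assuming fence->ops gains the __rcu annotation
in that series; rcu_access_pointer() rather than rcu_dereference(), since
we only test for NULL and never dereference the ops):

	return fence && !!rcu_access_pointer(fence->ops);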
> Which also means the new API is racy by definition and can give false positives if the fence were to be signaled while someone is checking.
Oh, that is a really, really good point. I hadn't thought about that because all current users check the fence only after it is signaled.
> Hmm... is the new API too weak, if it can only be called under very limited circumstances?
Yes, exactly that. All callers use this only to decide on the correct cleanup path.
So the fence is either fully signaled or was never initialized in the first place.
> Would it be better to solve it in the drivers by tracking state?
The alternative I had in mind was to use another DMA_FENCE_FLAG_... for that.
I will probably use that approach instead, just to make it extra defensive.
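Roughly like this (just a sketch; the flag name is made up, such a bit
doesn't exist yet and would have to be set by dma_fence_init() and never
cleared):

	static inline bool dma_fence_is_initialized(struct dma_fence *fence)
	{
		/* Hypothetical DMA_FENCE_FLAG_INITIALIZED_BIT, set on init */
		return fence &&
		       test_bit(DMA_FENCE_FLAG_INITIALIZED_BIT, &fence->flags);
	}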
Thanks,
Christian.
>
> Regards,
>
> Tvrtko
>
>> +}
>> +
>> /**
>> * dma_fence_put - decreases refcount of the fence
>> * @fence: fence to reduce refcount of
>
On Wed, Oct 29, 2025 at 07:07:42PM +0100, Neil Armstrong wrote:
> The I2C Hub controller is a simpler GENI I2C variant that doesn't
> support DMA at all. Add a no_dma flag to make sure it never selects
> the SE DMA mode with mappable 32-byte-long transfers.
>
> Fixes: cacd9643eca7 ("i2c: qcom-geni: add support for I2C Master Hub variant")
> Signed-off-by: Neil Armstrong <neil.armstrong(a)linaro.org>
> Reviewed-by: Konrad Dybcio <konrad.dybcio(a)oss.qualcomm.com>
> Reviewed-by: Mukesh Kumar Savaliya <mukesh.savaliya(a)oss.qualcomm.com>
Applied to for-current, thanks!
On 1/13/26 17:12, Philipp Stanner wrote:
> On Tue, 2026-01-13 at 16:16 +0100, Christian König wrote:
>> Using the inline lock is now the recommended way for dma_fence implementations.
>>
>> For the scheduler fence, use the inline lock for the scheduled fence
>> part and then the lock from the scheduled fence as the external lock
>> for the finished fence.
>>
>> This way there is no functional difference, except for saving the space
>> for the separate lock.
>>
>> v2: re-work the patch to avoid any functional difference
>
> *cough cough*
>
>>
>> Signed-off-by: Christian König <christian.koenig(a)amd.com>
>> ---
>> drivers/gpu/drm/scheduler/sched_fence.c | 6 +++---
>> include/drm/gpu_scheduler.h | 4 ----
>> 2 files changed, 3 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
>> index 724d77694246..112677231f9a 100644
>> --- a/drivers/gpu/drm/scheduler/sched_fence.c
>> +++ b/drivers/gpu/drm/scheduler/sched_fence.c
>> @@ -217,7 +217,6 @@ struct drm_sched_fence *drm_sched_fence_alloc(struct drm_sched_entity *entity,
>>
>> fence->owner = owner;
>> fence->drm_client_id = drm_client_id;
>> - spin_lock_init(&fence->lock);
>>
>> return fence;
>> }
>> @@ -230,9 +229,10 @@ void drm_sched_fence_init(struct drm_sched_fence *fence,
>> fence->sched = entity->rq->sched;
>> seq = atomic_inc_return(&entity->fence_seq);
>> dma_fence_init(&fence->scheduled, &drm_sched_fence_ops_scheduled,
>> - &fence->lock, entity->fence_context, seq);
>> + NULL, entity->fence_context, seq);
>> dma_fence_init(&fence->finished, &drm_sched_fence_ops_finished,
>> - &fence->lock, entity->fence_context + 1, seq);
>> + dma_fence_spinlock(&fence->scheduled),
>
> I think while you are correct that this is no functional difference, it
> is still a bad idea which violates the entire idea of your series:
>
> All fences are now independent from each other and the fence context –
> except for those two.
>
> Some fences are more equal than others ;)
Yeah, I was going back and forth once more on whether I should keep this patch at all or just drop it.
> By implementing this, you would also show people browsing the code
> that having fences share locks can be a good idea, or at least can be
> done. Do you want that?
Good question. For almost all cases we don't want this, but once more the scheduler is special.
In the scheduler we have two fences in one, the scheduled one and the finished one.
So here it technically makes sense to use this construct to be defensive.
But on the other hand it has no practical value, because it still doesn't allow us to unload the scheduler module. We would need a much wider rework to be able to do that.
So maybe I should just drop this patch, or at least hold it back until we have had time to figure out what the next steps are.
> As far as I have learned from you and our discussions, that would be a
> very bombastic violation of the sacred "dma-fence-rules".
Well, using the inline lock is "only" a strong recommendation. It's not as heavy as the signaling rules, because when you mess those up you can easily kill the whole system.
> I believe it's definitely worth sacrificing some bytes so that those
> two fences get fully decoupled. Who will have it on their radar that
> they are special? Think about future reworks.
This doesn't even save any bytes; my thinking was more that this is the more defensive approach, should anybody use the spinlock pointer from the scheduler fence to do some locking.
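For reference, fully decoupling them would just mean both fences using
their own inline lock (a sketch against this patch):

	dma_fence_init(&fence->scheduled, &drm_sched_fence_ops_scheduled,
		       NULL, entity->fence_context, seq);
	dma_fence_init(&fence->finished, &drm_sched_fence_ops_finished,
		       NULL, entity->fence_context + 1, seq);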
> Besides that, no objections from my side.
Thanks,
Christian.
>
>
> P.
>
>> + entity->fence_context + 1, seq);
>> }
>>
>> module_init(drm_sched_fence_slab_init);
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 78e07c2507c7..ad3704685163 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -297,10 +297,6 @@ struct drm_sched_fence {
>> * belongs to.
>> */
>> struct drm_gpu_scheduler *sched;
>> - /**
>> - * @lock: the lock used by the scheduled and the finished fences.
>> - */
>> - spinlock_t lock;
>> /**
>> * @owner: job owner for debugging
>> */
>
On 1/13/26 22:32, Eric Chanudet wrote:
> The system dma-buf heap lets userspace allocate buffers from the page
> allocator. However, these allocations are not accounted for in memcg,
> allowing processes to escape limits that may be configured.
>
> Pass __GFP_ACCOUNT for system heap allocations, based on the
> dma_heap.mem_accounting parameter, to use memcg and account for them.
>
> Signed-off-by: Eric Chanudet <echanude(a)redhat.com>
> ---
> drivers/dma-buf/heaps/system_heap.c | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
> index 4c782fe33fd497a74eb5065797259576f9b651b6..139b50df64ed4c4a6fdd69f25fe48324fbe2c481 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -52,6 +52,8 @@ static gfp_t order_flags[] = {HIGH_ORDER_GFP, HIGH_ORDER_GFP, LOW_ORDER_GFP};
> static const unsigned int orders[] = {8, 4, 0};
> #define NUM_ORDERS ARRAY_SIZE(orders)
>
> +extern bool mem_accounting;
Please define that in some header. Apart from that, this looks good technically.
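I.e. something like this (a sketch; the exact header is up to you,
assuming the parameter itself stays in the dma-heap core):

	/* in include/linux/dma-heap.h or a heaps-internal header */
	extern bool mem_accounting;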
But after the discussion it sounds more and more like we don't want to account device-driver-allocated memory in memcg at all.
Regards,
Christian.
> +
> static int dup_sg_table(struct sg_table *from, struct sg_table *to)
> {
> struct scatterlist *sg, *new_sg;
> @@ -320,14 +322,17 @@ static struct page *alloc_largest_available(unsigned long size,
> {
> struct page *page;
> int i;
> + gfp_t flags;
>
> for (i = 0; i < NUM_ORDERS; i++) {
> if (size < (PAGE_SIZE << orders[i]))
> continue;
> if (max_order < orders[i])
> continue;
> -
> - page = alloc_pages(order_flags[i], orders[i]);
> + flags = order_flags[i];
> + if (mem_accounting)
> + flags |= __GFP_ACCOUNT;
> + page = alloc_pages(flags, orders[i]);
> if (!page)
> continue;
> return page;
>