Recently, we noticed an issue where a process went into direct reclaim while holding the kernfs rw semaphore for sysfs in write(exclusive) mode. This caused processes that were doing DMA-BUF exports and releases to go into uninterruptible sleep since they needed to acquire the same semaphore for the DMA-BUF sysfs entry creation/deletion. In order to avoid blocking DMA-BUF export/release for an indeterminate amount of time while another process is holding the sysfs rw semaphore in exclusive mode, this patch moves the per-buffer sysfs file creation/deletion to a kthread.
Fixes: bdb8d06dfefd ("dmabuf: Add the capability to expose DMA-BUF stats in sysfs") Signed-off-by: Hridya Valsaraju <hridya@google.com> --- drivers/dma-buf/dma-buf-sysfs-stats.c | 343 ++++++++++++++++++++++++-- include/linux/dma-buf.h | 46 ++++ 2 files changed, 366 insertions(+), 23 deletions(-)
diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 053baadcada9..3251fdf2f05f 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -7,13 +7,39 @@
#include <linux/dma-buf.h> #include <linux/dma-resv.h> +#include <linux/freezer.h> #include <linux/kobject.h> +#include <linux/kthread.h> +#include <linux/list.h> #include <linux/printk.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/sysfs.h>
#include "dma-buf-sysfs-stats.h"
+struct dmabuf_kobj_work { + struct list_head list; + struct dma_buf_sysfs_entry *sysfs_entry; + struct dma_buf_sysfs_entry_metadata *sysfs_metadata; + unsigned long uid; +}; + +/* Both kobject setup and teardown work gets queued on the list. */ +static LIST_HEAD(dmabuf_kobj_work_list); + +/* dmabuf_kobj_list_lock protects dmabuf_kobj_work_list. */ +static DEFINE_SPINLOCK(dmabuf_kobj_list_lock); + +/* + * dmabuf_sysfs_show_lock prevents a race between a DMA-BUF sysfs file being + * read and the DMA-BUF being freed by protecting sysfs_entry->dmabuf. + */ +static DEFINE_SPINLOCK(dmabuf_sysfs_show_lock); + +static struct task_struct *dmabuf_kobject_task; +static wait_queue_head_t dmabuf_kobject_waitqueue; + #define to_dma_buf_entry_from_kobj(x) container_of(x, struct dma_buf_sysfs_entry, kobj)
/** @@ -64,15 +90,26 @@ static ssize_t dma_buf_stats_attribute_show(struct kobject *kobj, struct dma_buf_stats_attribute *attribute; struct dma_buf_sysfs_entry *sysfs_entry; struct dma_buf *dmabuf; + int ret;
attribute = to_dma_buf_stats_attr(attr); sysfs_entry = to_dma_buf_entry_from_kobj(kobj); + + /* + * acquire dmabuf_sysfs_show_lock to prevent a race with the DMA-BUF + * being freed while sysfs_entry->dmabuf is being accessed. + */ + spin_lock(&dmabuf_sysfs_show_lock); dmabuf = sysfs_entry->dmabuf;
- if (!dmabuf || !attribute->show) + if (!dmabuf || !attribute->show) { + spin_unlock(&dmabuf_sysfs_show_lock); return -EIO; + }
- return attribute->show(dmabuf, attribute, buf); + ret = attribute->show(dmabuf, attribute, buf); + spin_unlock(&dmabuf_sysfs_show_lock); + return ret; }
static const struct sysfs_ops dma_buf_stats_sysfs_ops = { @@ -118,33 +155,275 @@ static struct kobj_type dma_buf_ktype = { .default_groups = dma_buf_stats_default_groups, };
-void dma_buf_stats_teardown(struct dma_buf *dmabuf) +/* Statistics files do not need to send uevents. */ +static int dmabuf_sysfs_uevent_filter(struct kset *kset, struct kobject *kobj) { - struct dma_buf_sysfs_entry *sysfs_entry; + return 0; +}
- sysfs_entry = dmabuf->sysfs_entry; - if (!sysfs_entry) - return; +static const struct kset_uevent_ops dmabuf_sysfs_no_uevent_ops = { + .filter = dmabuf_sysfs_uevent_filter, +}; + +/* setup of sysfs entries done asynchronously in the worker thread. */ +static void dma_buf_sysfs_stats_setup_work(struct dmabuf_kobj_work *kobject_work) +{ + struct dma_buf_sysfs_entry *sysfs_entry = kobject_work->sysfs_entry; + struct dma_buf_sysfs_entry_metadata *sysfs_metadata = + kobject_work->sysfs_metadata; + bool free_metadata = false; + + int ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL, + "%lu", kobject_work->uid); + if (ret) { + kobject_put(&sysfs_entry->kobj); + + spin_lock(&sysfs_metadata->sysfs_entry_lock); + if (sysfs_metadata->status == SYSFS_ENTRY_INIT_ABORTED) { + /* + * SYSFS_ENTRY_INIT_ABORTED means that the DMA-BUF has already + * been freed. At this point, it's safe to free the memory for + * the sysfs_metadata. + */ + free_metadata = true; + } else { + /* + * The DMA-BUF has not yet been freed, set the status to + * SYSFS_ENTRY_ERROR so that when the DMA-BUF gets + * freed, we know there is no need to teardown the sysfs + * entry. + */ + sysfs_metadata->status = SYSFS_ENTRY_ERROR; + } + goto unlock; + } + + /* + * If the DMA-BUF has not yet been released, status would still be + * SYSFS_ENTRY_INIT_IN_PROGRESS. We set the status as initialized. + */ + spin_lock(&sysfs_metadata->sysfs_entry_lock); + if (sysfs_metadata->status == SYSFS_ENTRY_INIT_IN_PROGRESS) { + sysfs_metadata->status = SYSFS_ENTRY_INITIALIZED; + goto unlock; + }
+ /* + * At this point the status is SYSFS_ENTRY_INIT_ABORTED which means + * that the DMA-BUF has already been freed. Hence, we cleanup the + * sysfs_entry and its metadata since neither of them are needed + * anymore. + */ + free_metadata = true; kobject_del(&sysfs_entry->kobj); kobject_put(&sysfs_entry->kobj); + +unlock: + spin_unlock(&sysfs_metadata->sysfs_entry_lock); + if (free_metadata) { + kfree(kobject_work->sysfs_metadata); + kobject_work->sysfs_metadata = NULL; + } }
+/* teardown of sysfs entries done asynchronously in the worker thread. */ +static void dma_buf_sysfs_stats_teardown_work(struct dmabuf_kobj_work *kobject_work) +{ + struct dma_buf_sysfs_entry *sysfs_entry = kobject_work->sysfs_entry;
-/* Statistics files do not need to send uevents. */ -static int dmabuf_sysfs_uevent_filter(struct kset *kset, struct kobject *kobj) + kobject_del(&sysfs_entry->kobj); + kobject_put(&sysfs_entry->kobj); + + kfree(kobject_work->sysfs_metadata); + kobject_work->sysfs_metadata = NULL; +} + +/* do setup or teardown of sysfs entries as required */ +static void do_kobject_work(struct dmabuf_kobj_work *kobject_work) { + struct dma_buf_sysfs_entry_metadata *sysfs_metadata; + bool setup_needed = false; + bool teardown_needed = false; + + sysfs_metadata = kobject_work->sysfs_metadata; + spin_lock(&sysfs_metadata->sysfs_entry_lock); + if (sysfs_metadata->status == SYSFS_ENTRY_UNINITIALIZED) { + setup_needed = true; + sysfs_metadata->status = SYSFS_ENTRY_INIT_IN_PROGRESS; + } else if (sysfs_metadata->status == SYSFS_ENTRY_INITIALIZED) { + teardown_needed = true; + } + + /* + * It is ok to release the sysfs_entry_lock here. + * + * If setup_needed is true, we check the status again after the kobject + * initialization to see if it has been set to SYSFS_ENTRY_INIT_ABORTED + * and if so teardown the kobject. + * + * If teardown_needed is true, there are no more changes expected to the + * status. + * + * If neither setup_needed nor teardown_needed is true, it + * means the DMA-BUF was freed before we got around to setting up the + * sysfs entry and hence we just need to release the metadata and + * return.
+ */ + spin_unlock(&kobject_work->sysfs_metadata->sysfs_entry_lock); + + if (setup_needed) + dma_buf_sysfs_stats_setup_work(kobject_work); + else if (teardown_needed) + dma_buf_sysfs_stats_teardown_work(kobject_work); + else + kfree(kobject_work->sysfs_metadata); + + kfree(kobject_work); +} + +static struct dmabuf_kobj_work *get_next_kobj_work(void) +{ + struct dmabuf_kobj_work *kobject_work; + + spin_lock(&dmabuf_kobj_list_lock); + kobject_work = list_first_entry_or_null(&dmabuf_kobj_work_list, + struct dmabuf_kobj_work, list); + if (kobject_work) + list_del(&kobject_work->list); + spin_unlock(&dmabuf_kobj_list_lock); + return kobject_work; +} + +static int kobject_work_thread(void *data) +{ + struct dmabuf_kobj_work *kobject_work; + + while (1) { + wait_event_freezable(dmabuf_kobject_waitqueue, + (kobject_work = get_next_kobj_work())); + do_kobject_work(kobject_work); + } + return 0; }
-static const struct kset_uevent_ops dmabuf_sysfs_no_uevent_ops = { - .filter = dmabuf_sysfs_uevent_filter, -}; +static int kobject_worklist_init(void) +{ + init_waitqueue_head(&dmabuf_kobject_waitqueue); + dmabuf_kobject_task = kthread_run(kobject_work_thread, NULL, + "%s", "dmabuf-kobject-worker"); + if (IS_ERR(dmabuf_kobject_task)) { + pr_err("Creating thread for deferred sysfs entry creation/deletion failed\n"); + return PTR_ERR(dmabuf_kobject_task); + } + sched_set_normal(dmabuf_kobject_task, MAX_NICE); + + return 0; +} + +static void deferred_kobject_create(struct dmabuf_kobj_work *kobject_work) +{ + INIT_LIST_HEAD(&kobject_work->list); + + spin_lock(&dmabuf_kobj_list_lock); + + list_add_tail(&kobject_work->list, &dmabuf_kobj_work_list); + + spin_unlock(&dmabuf_kobj_list_lock); + + wake_up(&dmabuf_kobject_waitqueue); +} + + +void dma_buf_stats_teardown(struct dma_buf *dmabuf) +{ + struct dma_buf_sysfs_entry *sysfs_entry; + struct dma_buf_sysfs_entry_metadata *sysfs_metadata; + struct dmabuf_kobj_work *kobj_work; + + sysfs_entry = dmabuf->sysfs_entry; + if (!sysfs_entry) + return; + + sysfs_metadata = dmabuf->sysfs_entry_metadata; + if (!sysfs_metadata) + return; + + spin_lock(&sysfs_metadata->sysfs_entry_lock); + + if (sysfs_metadata->status == SYSFS_ENTRY_UNINITIALIZED || + sysfs_metadata->status == SYSFS_ENTRY_INIT_IN_PROGRESS) { + /* + * The sysfs entry for this buffer has not yet been initialized, + * we set the status to SYSFS_ENTRY_INIT_ABORTED to abort the + * initialization. + */ + sysfs_metadata->status = SYSFS_ENTRY_INIT_ABORTED; + spin_unlock(&sysfs_metadata->sysfs_entry_lock); + + /* + * In case kobject initialization completes right as we release + * the sysfs_entry_lock, disable show() for the sysfs entry by + * setting sysfs_entry->dmabuf to NULL to prevent a race. 
+ */ + spin_lock(&dmabuf_sysfs_show_lock); + sysfs_entry->dmabuf = NULL; + spin_unlock(&dmabuf_sysfs_show_lock); + + return; + } + + if (sysfs_metadata->status == SYSFS_ENTRY_INITIALIZED) { + /* + * queue teardown work only if sysfs_entry is fully initialized. + * It is ok to release the sysfs_entry_lock here since the + * status can no longer change. + */ + spin_unlock(&sysfs_metadata->sysfs_entry_lock); + + /* + * Meanwhile disable show() for the sysfs entry to avoid a race + * between teardown and show(). + */ + spin_lock(&dmabuf_sysfs_show_lock); + sysfs_entry->dmabuf = NULL; + spin_unlock(&dmabuf_sysfs_show_lock); + + kobj_work = kzalloc(sizeof(struct dmabuf_kobj_work), GFP_KERNEL); + if (!kobj_work) { + /* do the teardown immediately. */ + kobject_del(&sysfs_entry->kobj); + kobject_put(&sysfs_entry->kobj); + kfree(sysfs_metadata); + } else { + /* queue teardown work. */ + kobj_work->sysfs_entry = dmabuf->sysfs_entry; + kobj_work->sysfs_metadata = dmabuf->sysfs_entry_metadata; + deferred_kobject_create(kobj_work); + } + + return; + } + + /* + * status is SYSFS_ENTRY_ERROR so we only need to free the + * metadata. + */ + spin_unlock(&sysfs_metadata->sysfs_entry_lock); + kfree(dmabuf->sysfs_entry_metadata); + dmabuf->sysfs_entry_metadata = NULL; +}
static struct kset *dma_buf_stats_kset; static struct kset *dma_buf_per_buffer_stats_kset; int dma_buf_init_sysfs_statistics(void) { + int ret; + + ret = kobject_worklist_init(); + if (ret) + return ret; + dma_buf_stats_kset = kset_create_and_add("dmabuf", &dmabuf_sysfs_no_uevent_ops, kernel_kobj); @@ -171,7 +450,8 @@ void dma_buf_uninit_sysfs_statistics(void) int dma_buf_stats_setup(struct dma_buf *dmabuf) { struct dma_buf_sysfs_entry *sysfs_entry; - int ret; + struct dma_buf_sysfs_entry_metadata *sysfs_metadata; + struct dmabuf_kobj_work *kobj_work;
if (!dmabuf || !dmabuf->file) return -EINVAL; @@ -188,18 +468,35 @@ int dma_buf_stats_setup(struct dma_buf *dmabuf) sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; sysfs_entry->dmabuf = dmabuf;
+ sysfs_metadata = kzalloc(sizeof(struct dma_buf_sysfs_entry_metadata), + GFP_KERNEL); + if (!sysfs_metadata) { + kfree(sysfs_entry); + return -ENOMEM; + } + dmabuf->sysfs_entry = sysfs_entry;
- /* create the directory for buffer stats */ - ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL, - "%lu", file_inode(dmabuf->file)->i_ino); - if (ret) - goto err_sysfs_dmabuf; + sysfs_metadata->status = SYSFS_ENTRY_UNINITIALIZED; + spin_lock_init(&sysfs_metadata->sysfs_entry_lock);
- return 0; + dmabuf->sysfs_entry_metadata = sysfs_metadata;
-err_sysfs_dmabuf: - kobject_put(&sysfs_entry->kobj); - dmabuf->sysfs_entry = NULL; - return ret; + kobj_work = kzalloc(sizeof(struct dmabuf_kobj_work), GFP_KERNEL); + if (!kobj_work) { + kfree(sysfs_entry); + kfree(sysfs_metadata); + return -ENOMEM; + } + + kobj_work->sysfs_entry = dmabuf->sysfs_entry; + kobj_work->sysfs_metadata = dmabuf->sysfs_entry_metadata; + /* + * stash the inode number in struct dmabuf_kobj_work since setup + * might race with DMA-BUF teardown. + */ + kobj_work->uid = file_inode(dmabuf->file)->i_ino; + + deferred_kobject_create(kobj_work); + return 0; } diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 7ab50076e7a6..0597690023a0 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -287,6 +287,50 @@ struct dma_buf_ops { void (*vunmap)(struct dma_buf *dmabuf, struct dma_buf_map *map); };
+#ifdef CONFIG_DMABUF_SYSFS_STATS +enum sysfs_entry_status { + SYSFS_ENTRY_UNINITIALIZED, + SYSFS_ENTRY_INIT_IN_PROGRESS, + SYSFS_ENTRY_ERROR, + SYSFS_ENTRY_INIT_ABORTED, + SYSFS_ENTRY_INITIALIZED, +}; + +/* + * struct dma_buf_sysfs_entry_metadata - Holds the current status for the + * DMA-BUF sysfs entry. + * + * @status: holds the current status for the DMA-BUF sysfs entry. The status of + * the sysfs entry has the following path. + * + * SYSFS_ENTRY_UNINITIALIZED + * __________________|____________________ + * | | + * SYSFS_ENTRY_INIT_IN_PROGRESS SYSFS_ENTRY_INIT_ABORTED (DMA-BUF + * | gets freed + * | before + * | init) + * ________|_____________________________________ + * | | | + * SYSFS_ENTRY_INITIALIZED | SYSFS_ENTRY_INIT_ABORTED + * | (DMA-BUF gets freed during kobject + * | init) + * | + * | + * SYSFS_ENTRY_ERROR + * (error during kobject init) + * + * @sysfs_entry_lock: protects access to @status. + */ +struct dma_buf_sysfs_entry_metadata { + enum sysfs_entry_status status; + /* + * Protects sysfs_entry_metadata->status + */ + spinlock_t sysfs_entry_lock; +}; +#endif + /** * struct dma_buf - shared buffer object * @@ -452,6 +496,8 @@ struct dma_buf { struct kobject kobj; struct dma_buf *dmabuf; } *sysfs_entry; + + struct dma_buf_sysfs_entry_metadata *sysfs_entry_metadata; #endif };