From: Wang Wensheng <wangwensheng4(a)huawei.com>
If we can't allocate this size, try something smaller with half of the
size. Its order should be decreased by one instead of divided by two.
Link: https://lkml.kernel.org/r/20221109094434.84046-3-wangwensheng4@huawei.com
Cc: <mhiramat(a)kernel.org>
Cc: <mark.rutland(a)arm.com>
Cc: stable(a)vger.kernel.org
Fixes: a79008755497d ("ftrace: Allocate the mcount record pages as groups")
Signed-off-by: Wang Wensheng <wangwensheng4(a)huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ftrace.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8b13ce2eae70..56a168121bfc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3190,7 +3190,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
/* if we can't allocate this size, try something smaller */
if (!order)
return -ENOMEM;
- order >>= 1;
+ order--;
goto again;
}
--
2.35.1
Pass in EPOLL_URING when signaling eventfd or doing poll related wakups,
so that we can check for a circular event dependency between eventfd
and epoll. If this flag is set when our wakeup handlers are called, then
we know we have a dependency that needs to terminate multishot requests.
eventfd and epoll are the only such possible dependencies.
Cc: stable(a)vger.kernel.org # 6.0
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
io_uring/io_uring.c | 4 ++--
io_uring/io_uring.h | 15 +++++++++++----
io_uring/poll.c | 8 ++++++++
3 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8840cf3e20f2..53d0043b77a5 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -495,7 +495,7 @@ static void io_eventfd_ops(struct rcu_head *rcu)
int ops = atomic_xchg(&ev_fd->ops, 0);
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
- eventfd_signal(ev_fd->cq_ev_fd, 1);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
@@ -531,7 +531,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
goto out;
if (likely(eventfd_signal_allowed())) {
- eventfd_signal(ev_fd->cq_ev_fd, 1);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING);
} else {
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index cef5ff924e63..f6cf74cd692b 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -4,6 +4,7 @@
#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/io_uring_types.h>
+#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"
@@ -207,12 +208,18 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
static inline void __io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
- * wake_up_all() may seem excessive, but io_wake_function() and
- * io_should_wake() handle the termination of the loop and only
- * wake as many waiters as we need to.
+ * Trigger waitqueue handler on all waiters on our waitqueue. This
+ * won't necessarily wake up all the tasks, io_should_wake() will make
+ * that decision.
+ *
+ * Pass in EPOLLIN|EPOLL_URING as the poll wakeup key. The latter set
+ * in the mask so that if we recurse back into our own poll waitqueue
+ * handlers, we know we have a dependency between eventfd or epoll and
+ * should terminate multishot poll at that point.
*/
if (waitqueue_active(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
+ __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
+ poll_to_key(EPOLL_URING | EPOLLIN));
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 055632e9092a..b5d9426c60f6 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -394,6 +394,14 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
return 0;
if (io_poll_get_ownership(req)) {
+ /*
+ * If we trigger a multishot poll off our own wakeup path,
+ * disable multishot as there is a circular dependency between
+ * CQ posting and triggering the event.
+ */
+ if (mask & EPOLL_URING)
+ poll->events |= EPOLLONESHOT;
+
/* optional, saves extra locking for removal in tw handler */
if (mask && poll->events & EPOLLONESHOT) {
list_del_init(&poll->wait.entry);
--
2.35.1
This is identical to eventfd_signal(), but it allows the caller to pass
in a mask to be used for the poll wakeup key. The use case is avoiding
repeated multishot triggers if we have a dependency between eventfd and
io_uring.
If we setup an eventfd context and register that as the io_uring eventfd,
and at the same time queue a multishot poll request for the eventfd
context, then any CQE posted will repeatedly trigger the multishot request
until it terminates when the CQ ring overflows.
In preparation for io_uring detecting this circular dependency, add the
mentioned helper so that io_uring can pass in EPOLL_URING as part of the
poll wakeup key.
Cc: stable(a)vger.kernel.org # 6.0
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
fs/eventfd.c | 37 +++++++++++++++++++++----------------
include/linux/eventfd.h | 1 +
2 files changed, 22 insertions(+), 16 deletions(-)
diff --git a/fs/eventfd.c b/fs/eventfd.c
index c0ffee99ad23..249ca6c0b784 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -43,21 +43,7 @@ struct eventfd_ctx {
int id;
};
-/**
- * eventfd_signal - Adds @n to the eventfd counter.
- * @ctx: [in] Pointer to the eventfd context.
- * @n: [in] Value of the counter to be added to the eventfd internal counter.
- * The value cannot be negative.
- *
- * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
- *
- * Returns the amount by which the counter was incremented. This will be less
- * than @n if the counter has overflowed.
- */
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
{
unsigned long flags;
@@ -78,12 +64,31 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
- wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
current->in_eventfd = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
}
+
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ * The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returning a EPOLLERR
+ * to poll(2).
+ *
+ * Returns the amount by which the counter was incremented. This will be less
+ * than @n if the counter has overflowed.
+ */
+__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+{
+ return eventfd_signal_mask(ctx, n, 0);
+}
EXPORT_SYMBOL_GPL(eventfd_signal);
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 30eb30d6909b..e849329ce1a8 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -40,6 +40,7 @@ struct file *eventfd_fget(int fd);
struct eventfd_ctx *eventfd_ctx_fdget(int fd);
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask);
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
__u64 *cnt);
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
--
2.35.1
We can have dependencies between epoll and io_uring. Consider an epoll
context, identified by the epfd file descriptor, and an io_uring file
descriptor identified by iofd. If we add iofd to the epfd context, and
arm a multishot poll request for epfd with iofd, then the multishot
poll request will repeatedly trigger and generate events until terminated
by CQ ring overflow. This isn't a desired behavior.
Add EPOLL_URING so that io_uring can pass it in as part of the poll wakeup
key, and io_uring can check for that to detect a potential recursive
invocation.
Cc: stable(a)vger.kernel.org # 6.0
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
fs/eventpoll.c | 18 ++++++++++--------
include/uapi/linux/eventpoll.h | 6 ++++++
2 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 52954d4637b5..e864256001e8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -491,7 +491,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+ unsigned pollflags)
{
struct eventpoll *ep_src;
unsigned long flags;
@@ -522,16 +523,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
}
spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
ep->nests = nests + 1;
- wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+ wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
ep->nests = 0;
spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
#else
-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+ unsigned pollflags)
{
- wake_up_poll(&ep->poll_wait, EPOLLIN);
+ wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}
#endif
@@ -742,7 +744,7 @@ static void ep_free(struct eventpoll *ep)
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(ep, NULL);
+ ep_poll_safewake(ep, NULL, 0);
/*
* We need to lock this because we could be hit by
@@ -1208,7 +1210,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(ep, epi);
+ ep_poll_safewake(ep, epi, pollflags & EPOLL_URING);
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1553,7 +1555,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(ep, NULL);
+ ep_poll_safewake(ep, NULL, 0);
return 0;
}
@@ -1629,7 +1631,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(ep, NULL);
+ ep_poll_safewake(ep, NULL, 0);
return 0;
}
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..532cc7fa75c0 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -41,6 +41,12 @@
#define EPOLLMSG (__force __poll_t)0x00000400
#define EPOLLRDHUP (__force __poll_t)0x00002000
+/*
+ * Internal flag - wakeup generated by io_uring, used to detect recursion back
+ * into the io_uring poll handler.
+ */
+#define EPOLL_URING ((__force __poll_t)(1U << 27))
+
/* Set exclusive wakeup mode for the target file descriptor */
#define EPOLLEXCLUSIVE ((__force __poll_t)(1U << 28))
--
2.35.1
The member void *data in the structure devfreq can be overwrite
by governor_userspace. For example:
1. The device driver assigned the devfreq governor to simple_ondemand
by the function devfreq_add_device() and init the devfreq member
void *data to a pointer of a static structure devfreq_simple_ondemand_data
by the function devfreq_add_device().
2. The user changed the devfreq governor to userspace by the command
"echo userspace > /sys/class/devfreq/.../governor".
3. The governor userspace alloced a dynamic memory for the struct
userspace_data and assigend the member void *data of devfreq to
this memory by the function userspace_init().
4. The user changed the devfreq governor back to simple_ondemand
by the command "echo simple_ondemand > /sys/class/devfreq/.../governor".
5. The governor userspace exited and assigned the member void *data
in the structure devfreq to NULL by the function userspace_exit().
6. The governor simple_ondemand fetched the static information of
devfreq_simple_ondemand_data in the function
devfreq_simple_ondemand_func() but the member void *data of devfreq was
assigned to NULL by the function userspace_exit().
7. The information of upthreshold and downdifferential is lost
and the governor simple_ondemand can't work correctly.
The member void *data in the structure devfreq is designed for
a static pointer used in a governor and inited by the function
devfreq_add_device(). This patch add an element named governor_data
in the devfreq structure which can be used by a governor(E.g userspace)
who want to assign a private data to do some private things.
Fixes: ce26c5bb9569 ("PM / devfreq: Add basic governors")
Cc: stable(a)vger.kernel.org # 5.10+
Reviewed-by: Chanwoo Choi <cwchoi00(a)gmail.com>
Acked-by: MyungJoo Ham <myungjoo.ham(a)samsung.com>
Signed-off-by: Kant Fan <kant(a)allwinnertech.com>
---
drivers/devfreq/devfreq.c | 6 ++----
drivers/devfreq/governor_userspace.c | 12 ++++++------
include/linux/devfreq.h | 7 ++++---
3 files changed, 12 insertions(+), 13 deletions(-)
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 63347a5ae599..8c5f6f7fca11 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -776,8 +776,7 @@ static void remove_sysfs_files(struct devfreq *devfreq,
* @dev: the device to add devfreq feature.
* @profile: device-specific profile to run devfreq.
* @governor_name: name of the policy to choose frequency.
- * @data: private data for the governor. The devfreq framework does not
- * touch this value.
+ * @data: devfreq driver pass to governors, governor should not change it.
*/
struct devfreq *devfreq_add_device(struct device *dev,
struct devfreq_dev_profile *profile,
@@ -1011,8 +1010,7 @@ static void devm_devfreq_dev_release(struct device *dev, void *res)
* @dev: the device to add devfreq feature.
* @profile: device-specific profile to run devfreq.
* @governor_name: name of the policy to choose frequency.
- * @data: private data for the governor. The devfreq framework does not
- * touch this value.
+ * @data: devfreq driver pass to governors, governor should not change it.
*
* This function manages automatically the memory of devfreq device using device
* resource management and simplify the free operation for memory of devfreq
diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c
index ab9db7adb3ad..d69672ccacc4 100644
--- a/drivers/devfreq/governor_userspace.c
+++ b/drivers/devfreq/governor_userspace.c
@@ -21,7 +21,7 @@ struct userspace_data {
static int devfreq_userspace_func(struct devfreq *df, unsigned long *freq)
{
- struct userspace_data *data = df->data;
+ struct userspace_data *data = df->governor_data;
if (data->valid)
*freq = data->user_frequency;
@@ -40,7 +40,7 @@ static ssize_t set_freq_store(struct device *dev, struct device_attribute *attr,
int err = 0;
mutex_lock(&devfreq->lock);
- data = devfreq->data;
+ data = devfreq->governor_data;
sscanf(buf, "%lu", &wanted);
data->user_frequency = wanted;
@@ -60,7 +60,7 @@ static ssize_t set_freq_show(struct device *dev,
int err = 0;
mutex_lock(&devfreq->lock);
- data = devfreq->data;
+ data = devfreq->governor_data;
if (data->valid)
err = sprintf(buf, "%lu\n", data->user_frequency);
@@ -91,7 +91,7 @@ static int userspace_init(struct devfreq *devfreq)
goto out;
}
data->valid = false;
- devfreq->data = data;
+ devfreq->governor_data = data;
err = sysfs_create_group(&devfreq->dev.kobj, &dev_attr_group);
out:
@@ -107,8 +107,8 @@ static void userspace_exit(struct devfreq *devfreq)
if (devfreq->dev.kobj.sd)
sysfs_remove_group(&devfreq->dev.kobj, &dev_attr_group);
- kfree(devfreq->data);
- devfreq->data = NULL;
+ kfree(devfreq->governor_data);
+ devfreq->governor_data = NULL;
}
static int devfreq_userspace_handler(struct devfreq *devfreq,
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index 34aab4dd336c..4dc7cda4fd46 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -152,8 +152,8 @@ struct devfreq_stats {
* @max_state: count of entry present in the frequency table.
* @previous_freq: previously configured frequency value.
* @last_status: devfreq user device info, performance statistics
- * @data: Private data of the governor. The devfreq framework does not
- * touch this.
+ * @data: devfreq driver pass to governors, governor should not change it.
+ * @governor_data: private data for governors, devfreq core doesn't touch it.
* @user_min_freq_req: PM QoS minimum frequency request from user (via sysfs)
* @user_max_freq_req: PM QoS maximum frequency request from user (via sysfs)
* @scaling_min_freq: Limit minimum frequency requested by OPP interface
@@ -193,7 +193,8 @@ struct devfreq {
unsigned long previous_freq;
struct devfreq_dev_status last_status;
- void *data; /* private data for governors */
+ void *data;
+ void *governor_data;
struct dev_pm_qos_request user_min_freq_req;
struct dev_pm_qos_request user_max_freq_req;
--
2.29.0