On Tue, Jul 15, 2025 at 09:42:52AM -0700, Linus Torvalds wrote:
> On Tue, 15 Jul 2025 at 05:47, Nam Cao <namcao@linutronix.de> wrote:
> >  fs/eventpoll.c | 139 +++++++++----------------------------------------
> >  1 file changed, 26 insertions(+), 113 deletions(-)
> Yeah, this is more like the kind of diffstat I like to see for eventpoll.
> Plus it makes things fundamentally simpler.
> It might be worth looking at ep_poll_callback() - the only case that had
> read_lock_irqsave() - and seeing if perhaps some of the tests inside the
> lock might be done optimistically, or delayed to after the lock.
> For example, the whole wakeup sequence looks like it should be done
> *after* the ep->lock has been released, because it uses its own lock
> (ie the
>
>         if (sync)
>                 wake_up_sync(&ep->wq);
>         else
>                 wake_up(&ep->wq);
>
> thing uses the wq lock, and there is nothing that ep->lock protects
> there as far as I can tell.
> So I think this has some potential for _simple_ optimizations, but I'm
> not sure how worthwhile it is.
Actually, ep->lock does protect this part, because ep_poll() touches ep->wq without holding the waitqueue's lock.
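To make that concrete, here is a simplified sketch (not the exact kernel code) of the locking before the diff below: ep_poll() manipulates ep->wq under ep->lock rather than ep->wq.lock, so ep_poll_callback() cannot rely on the waitqueue's internal lock to serialize against the waiter:

	/*
	 * Waiter side, roughly what ep_poll() does today: the wait
	 * entry is added under ep->lock, not under ep->wq.lock.
	 */
	spin_lock_irq(&ep->lock);
	__set_current_state(TASK_INTERRUPTIBLE);
	eavail = ep_events_available(ep);
	if (!eavail)
		__add_wait_queue_exclusive(&ep->wq, &wait);
	spin_unlock_irq(&ep->lock);

	/*
	 * Wakeup side, roughly ep_poll_callback(): the lockless
	 * waitqueue_active() check is ordered against the waiter only
	 * because both sides hold the same ep->lock.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (waitqueue_active(&ep->wq))
		wake_up(&ep->wq);
	spin_unlock_irqrestore(&ep->lock, flags);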
We could do something like the diff below. But then things are not so simple anymore.
Best regards,
Nam
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a171f7e7dacc..5e6c7da606e7 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1252,8 +1252,6 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	unsigned long flags;
 	int ewake = 0;
 
-	spin_lock_irqsave(&ep->lock, flags);
-
 	ep_set_busy_poll_napi_id(epi);
 
 	/*
@@ -1263,7 +1261,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * until the next EPOLL_CTL_MOD will be issued.
 	 */
 	if (!(epi->event.events & ~EP_PRIVATE_BITS))
-		goto out_unlock;
+		goto out;
 
 	/*
 	 * Check the events coming with the callback. At this stage, not
@@ -1272,7 +1270,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * test for "key" != NULL before the event match test.
 	 */
 	if (pollflags && !(pollflags & epi->event.events))
-		goto out_unlock;
+		goto out;
 
 	/*
 	 * If we are transferring events to userspace, we can hold no locks
@@ -1280,6 +1278,8 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * semantics). All the events that happen during that period of time are
 	 * chained in ep->ovflist and requeued later on.
 	 */
+	spin_lock_irqsave(&ep->lock, flags);
+
 	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
 		if (epi->next == EP_UNACTIVE_PTR) {
 			epi->next = READ_ONCE(ep->ovflist);
@@ -1292,9 +1292,14 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 		ep_pm_stay_awake_rcu(epi);
 	}
 
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+
 	/*
 	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 	 * wait list.
+	 *
+	 * Memory barrier for waitqueue_active() from spin_unlock_irqrestore().
 	 */
 	if (waitqueue_active(&ep->wq)) {
 		if ((epi->event.events & EPOLLEXCLUSIVE) &&
@@ -1321,9 +1326,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
 
-out_unlock:
-	spin_unlock_irqrestore(&ep->lock, flags);
-
+out:
 	/* We have to call this outside the lock */
 	if (pwake)
 		ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
@@ -2001,13 +2004,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		init_wait(&wait);
 		wait.func = ep_autoremove_wake_function;
 
-		spin_lock_irq(&ep->lock);
-		/*
-		 * Barrierless variant, waitqueue_active() is called under
-		 * the same lock on wakeup ep_poll_callback() side, so it
-		 * is safe to avoid an explicit barrier.
-		 */
-		__set_current_state(TASK_INTERRUPTIBLE);
+		prepare_to_wait_exclusive(&ep->wq, &wait, TASK_INTERRUPTIBLE);
 
 		/*
 		 * Do the final check under the lock. ep_start/done_scan()
@@ -2016,10 +2013,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * period of time although events are pending, so lock is
 		 * important.
 		 */
+		spin_lock_irq(&ep->lock);
 		eavail = ep_events_available(ep);
-		if (!eavail)
-			__add_wait_queue_exclusive(&ep->wq, &wait);
-
 		spin_unlock_irq(&ep->lock);
 
 		if (!eavail)
@@ -2036,7 +2031,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		eavail = 1;
 
 		if (!list_empty_careful(&wait.entry)) {
-			spin_lock_irq(&ep->lock);
+			spin_lock_irq(&ep->wq.lock);
 			/*
 			 * If the thread timed out and is not on the wait queue,
 			 * it means that the thread was woken up after its
@@ -2047,7 +2042,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 			if (timed_out)
 				eavail = list_empty(&wait.entry);
 			__remove_wait_queue(&ep->wq, &wait);
-			spin_unlock_irq(&ep->lock);
+			spin_unlock_irq(&ep->wq.lock);
 		}
 	}
 }
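For reference, the "Memory barrier for waitqueue_active()" comment above refers to the lockless-wakeup pattern documented in include/linux/wait.h. A paraphrased sketch of that pattern (the condition here stands in for ep_events_available(ep); the diff relies on the ordering from spin_unlock_irqrestore() where the documented pattern uses an explicit smp_mb()):

	/* Waiter: prepare_to_wait_exclusive() sets the task state via
	 * set_current_state(), which implies a full memory barrier. */
	prepare_to_wait_exclusive(&ep->wq, &wait, TASK_INTERRUPTIBLE);
	if (!ep_events_available(ep))
		schedule();
	finish_wait(&ep->wq, &wait);

	/* Waker: publish the condition, order it against the lockless
	 * check, then wake up only if someone is actually waiting. */
	list_add_tail(&epi->rdllink, &ep->rdllist);
	smp_mb();
	if (waitqueue_active(&ep->wq))
		wake_up(&ep->wq);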