Hi all,
This patch set implements a userland-side API for generic deferrable timers, per linux/timer.h:
"A deferrable timer will work normally when the system is busy, but will not cause a CPU to come out of idle just to service it; instead, the timer will be serviced when the CPU eventually wakes up with a subsequent non-deferrable timer."
These timers are crucial for power saving, e.g. for periodic tasks that want to run in the background while the system is in use, but don't want to cause wakeups themselves.
The deferred timers are somewhat orthogonal to high-res external timers, since the deferred timer is tied to the system load, not just to some external decrementer source.
So, currently, the implementation has HZ precision, and the maximum interval is limited by the range of jiffies (i.e. with HZ=1000, on 32 bit that would be around max 49 days). Of course we could implement longer timeouts by rearming the timer, although it probably wouldn't make much sense in the real world, so we keep it simple and just return E2BIG if the interval is too large to be represented in jiffies.
Note that the code is still using time calculation that is done by the hrtimer routines, so we pretty much reuse everything except for the timer events themselves (i.e. we use calculation results of hrtimer_forward_now() and hrtimer_expires_remaining(), but never start the hrtimer). So the code path is pretty much the same for both hrtimers and deferrable timers.
We will use the timers to periodically read /proc/vmstat without forcibly waking up the system; but let's see, maybe there are other use cases that might be interesting for PM folks.
Thanks!
Anton.
-- fs/timerfd.c | 87 +++++++++++++++++++++++++++++++++++++++++++------ include/linux/jiffies.h | 3 ++ include/linux/ktime.h | 3 +- include/linux/timerfd.h | 4 ++- kernel/time.c | 23 +++++++++++++ 5 files changed, 108 insertions(+), 12 deletions(-)
Two new functions: jiffies_to_ktime() and ktime_to_jiffies(), we'll use them for timerfd deferred timers handling.
We fully reuse the logic from timespec implementations, so the functions are pretty straightforward.
The only tricky part is in the headers: jiffies.h now needs ktime_t for the new prototypes, while ktime.h still needs some declarations from jiffies.h (e.g. TICK_NSEC). To break this circular dependency, ktime.h includes jiffies.h only after ktime_t has been defined.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org --- include/linux/jiffies.h | 3 +++ include/linux/ktime.h | 3 ++- kernel/time.c | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 265e2c3..4451241 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/time.h> #include <linux/timex.h> +#include <linux/ktime.h> #include <asm/param.h> /* for HZ */
/* @@ -303,6 +304,8 @@ extern void jiffies_to_timespec(const unsigned long jiffies, extern unsigned long timeval_to_jiffies(const struct timeval *value); extern void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value); +extern unsigned long ktime_to_jiffies(ktime_t *value); +extern void jiffies_to_ktime(const unsigned long jiffies, ktime_t *value); extern clock_t jiffies_to_clock_t(unsigned long x); extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 603bec2..9551856 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -22,7 +22,6 @@ #define _LINUX_KTIME_H
#include <linux/time.h> -#include <linux/jiffies.h>
/* * ktime_t: @@ -58,6 +57,8 @@ union ktime {
typedef union ktime ktime_t; /* Kill this */
+#include <linux/jiffies.h> + #define KTIME_MAX ((s64)~((u64)1 << 63)) #if (BITS_PER_LONG == 64) # define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) diff --git a/kernel/time.c b/kernel/time.c index ba744cf..82c06c5 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -29,6 +29,7 @@
#include <linux/export.h> #include <linux/timex.h> +#include <linux/ktime.h> #include <linux/capability.h> #include <linux/clocksource.h> #include <linux/errno.h> @@ -566,6 +567,28 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) } EXPORT_SYMBOL(jiffies_to_timeval);
+unsigned long ktime_to_jiffies(ktime_t *value) +{ + struct timespec ts = ktime_to_timespec(*value); + + /* + * nsecs_to_jiffies(ktime_to_ns(*ktime)) is unsafe as nsecs_to_jiffies + * doesn't handle MAX_JIFFY_OFFSET. So we reuse the logic from the + * timespec to jiffies conversion function. + */ + return timespec_to_jiffies(&ts); +} +EXPORT_SYMBOL(ktime_to_jiffies); + +void jiffies_to_ktime(const unsigned long jiffies, ktime_t *value) +{ + struct timespec ts; + + jiffies_to_timespec(jiffies, &ts); + *value = timespec_to_ktime(ts); +} +EXPORT_SYMBOL(jiffies_to_ktime); + /* * Convert jiffies/jiffies_64 to clock_t and back. */
This patch introduces timerfd_rearm(), a small helper that forwards and restarts the hrtimer.
This small refactoring will also be useful if/when we add another backend for timerfd (like deferrable timers), so we won't need to duplicate the code further.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org --- fs/timerfd.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/fs/timerfd.c b/fs/timerfd.c index dffeb37..ecfb3f3 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -178,6 +178,14 @@ static unsigned int timerfd_poll(struct file *file, poll_table *wait) return events; }
+static u64 timerfd_rearm(struct timerfd_ctx *ctx) +{ + u64 orun = hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; + + hrtimer_restart(&ctx->tmr); + return orun; +} + static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -214,9 +222,7 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, * callback to avoid DoS attacks specifying a very * short timer period. */ - ticks += hrtimer_forward_now(&ctx->tmr, - ctx->tintv) - 1; - hrtimer_restart(&ctx->tmr); + ticks += timerfd_rearm(ctx); } ctx->expired = 0; ctx->ticks = 0; @@ -355,9 +361,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) spin_lock_irq(&ctx->wqh.lock); if (ctx->expired && ctx->tintv.tv64) { ctx->expired = 0; - ctx->ticks += - hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; - hrtimer_restart(&ctx->tmr); + ctx->ticks += timerfd_rearm(ctx); } kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); kotmr.it_interval = ktime_to_timespec(ctx->tintv);
There is nothing hrtimer-specific inside timerfd_tmrproc() except the function prototype. We're about to add other timer types, so factor out a generic timerfd_expire() helper from timerfd_tmrproc().
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org --- fs/timerfd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/timerfd.c b/fs/timerfd.c index ecfb3f3..222db32 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -45,9 +45,8 @@ static DEFINE_SPINLOCK(cancel_lock); * flag, but we do not re-arm the timer (in case it's necessary, * tintv.tv64 != 0) until the timer is accessed. */ -static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) +static void timerfd_expire(struct timerfd_ctx *ctx) { - struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr); unsigned long flags;
spin_lock_irqsave(&ctx->wqh.lock, flags); @@ -56,6 +55,11 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) wake_up_locked(&ctx->wqh); spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+} + +static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) +{ + timerfd_expire(container_of(htmr, struct timerfd_ctx, tmr)); return HRTIMER_NORESTART; }
This patch implements a userland-side API for generic deferrable timers, per linux/timer.h:
"A deferrable timer will work normally when the system is busy, but will not cause a CPU to come out of idle just to service it; instead, the timer will be serviced when the CPU eventually wakes up with a subsequent non-deferrable timer."
These timers are crucial for power saving, e.g. for periodic tasks that want to run in the background while the system is in use, but don't want to cause wakeups themselves.
The deferred timers are somewhat orthogonal to high-res external timers, since the deferred timer is tied to the system load, not just to some external decrementer source.
So, currently, the implementation has HZ precision, and the maximum interval is limited by the range of jiffies (i.e. with HZ=1000, on 32 bit that would be around max 49 days). Of course we could implement longer timeouts by rearming the timer, although it probably wouldn't make much sense in the real world, so we keep it simple and just return E2BIG if the interval is too large to be represented in jiffies.
Note that the code is still using time calculation that is done by the hrtimer routines, so we pretty much reuse everything except for the timer events themselves (i.e. we use calculation results of hrtimer_forward_now() and hrtimer_expires_remaining(), but never start the hrtimer). So the code path is pretty much the same for both hrtimers and deferrable timers.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org --- fs/timerfd.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/timerfd.h | 4 ++- 2 files changed, 65 insertions(+), 4 deletions(-)
diff --git a/fs/timerfd.c b/fs/timerfd.c index 222db32..d608a57 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -18,6 +18,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/time.h> +#include <linux/timer.h> #include <linux/hrtimer.h> #include <linux/anon_inodes.h> #include <linux/timerfd.h> @@ -26,6 +27,8 @@
struct timerfd_ctx { struct hrtimer tmr; + struct timer_list dtmr; + bool deferrable; ktime_t tintv; ktime_t moffs; wait_queue_head_t wqh; @@ -63,6 +66,11 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) return HRTIMER_NORESTART; }
+static void timerfd_dtmrproc(unsigned long data) +{ + timerfd_expire((struct timerfd_ctx *)data); +} + /* * Called when the clock was set to cancel the timers in the cancel * list. This will wake up processes waiting on these timers. The @@ -131,6 +139,30 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; }
+static bool timerfd_deferrable_valid(ktime_t intv) +{ + ktime_t max; + + jiffies_to_ktime(MAX_JIFFY_OFFSET, &max); + if (intv.tv64 > max.tv64) + return 0; + return 1; +} + +static int timerfd_setup_deferrable(struct timerfd_ctx *ctx) +{ + ktime_t rem = timerfd_get_remaining(ctx); + + if (ctx->clockid != CLOCK_MONOTONIC) + return -EINVAL; + if (!timerfd_deferrable_valid(ctx->tintv) || + !timerfd_deferrable_valid(rem)) + return -E2BIG; + + mod_timer(&ctx->dtmr, jiffies + ktime_to_jiffies(&rem) + 1); + return 0; +} + static int timerfd_setup(struct timerfd_ctx *ctx, int flags, const struct itimerspec *ktmr) { @@ -148,8 +180,18 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, hrtimer_init(&ctx->tmr, clockid, htmode); hrtimer_set_expires(&ctx->tmr, texp); ctx->tmr.function = timerfd_tmrproc; + ctx->dtmr.function = timerfd_dtmrproc; + ctx->dtmr.data = (unsigned long)ctx; if (texp.tv64 != 0) { - hrtimer_start(&ctx->tmr, texp, htmode); + if (ctx->deferrable) { + int ret; + + ret = timerfd_setup_deferrable(ctx); + if (ret) + return ret; + } else { + hrtimer_start(&ctx->tmr, texp, htmode); + } if (timerfd_canceled(ctx)) return -ECANCELED; } @@ -162,6 +204,7 @@ static int timerfd_release(struct inode *inode, struct file *file)
timerfd_remove_cancel(ctx); hrtimer_cancel(&ctx->tmr); + del_timer_sync(&ctx->dtmr); kfree_rcu(ctx, rcu); return 0; } @@ -186,7 +229,12 @@ static u64 timerfd_rearm(struct timerfd_ctx *ctx) { u64 orun = hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1;
- hrtimer_restart(&ctx->tmr); + if (ctx->deferrable) + mod_timer(&ctx->dtmr, jiffies + + ktime_to_jiffies(&ctx->tintv) + 1); + else + hrtimer_restart(&ctx->tmr); + return orun; }
@@ -280,6 +328,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) init_waitqueue_head(&ctx->wqh); ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + init_timer_deferrable(&ctx->dtmr); ctx->moffs = ktime_get_monotonic_offset();
ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, @@ -319,13 +368,23 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, * it to the new values. */ for (;;) { + int canceled; + spin_lock_irq(&ctx->wqh.lock); - if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) + if (ctx->deferrable) + canceled = try_to_del_timer_sync(&ctx->dtmr); + else + canceled = hrtimer_try_to_cancel(&ctx->tmr); + + if (canceled >= 0) break; spin_unlock_irq(&ctx->wqh.lock); cpu_relax(); }
+ /* Must set a new value after we cancel the previous timer. */ + ctx->deferrable = flags & TFD_TIMER_DEFERRABLE; + /* * If the timer is expired and it's periodic, we need to advance it * because the caller may want to know the previous expiration time. diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index d3b57fa..33d9842 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -20,6 +20,7 @@ */ #define TFD_TIMER_ABSTIME (1 << 0) #define TFD_TIMER_CANCEL_ON_SET (1 << 1) +#define TFD_TIMER_DEFERRABLE (1 << 2) #define TFD_CLOEXEC O_CLOEXEC #define TFD_NONBLOCK O_NONBLOCK
@@ -27,6 +28,7 @@ /* Flags for timerfd_create. */ #define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS /* Flags for timerfd_settime. */ -#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) +#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET | \ + TFD_TIMER_DEFERRABLE)
#endif /* _LINUX_TIMERFD_H */
Hi all,
On Sat, Sep 01, 2012 at 10:43:35PM -0700, Anton Vorontsov wrote:
This patch set implements a userland-side API for generic deferrable timers, per linux/timer.h:
- A deferrable timer will work normally when the system is busy, but
- will not cause a CPU to come out of idle just to service it; instead,
- the timer will be serviced when the CPU eventually wakes up with a
- subsequent non-deferrable timer.
These timers are crucial for power saving, e.g. for periodic tasks that want to run in the background while the system is in use, but don't want to cause wakeups themselves.
Just a friendly ping. Has anyone had a chance to look into this, i.e. whether the idea is good or bad, and whether the implementation is OK?
Thanks!
Anton.
linaro-kernel@lists.linaro.org