On systems with a 32-bit time_t, applications can run into quite a few problems when time overflows in the year 2038. Besides ending up in an unexpected state because integer operations on time_t variables are not checked, some system calls behave unexpectedly: e.g. the system time can't be set back to the current time (negative value), and timers with the ABSTIME flag either can't be set (negative value) or expire immediately (the current time is always larger).
It would be unrealistic to expect all applications to be able to handle all these problems. Year 2038 is still many years away, but this can be a problem even now: the clock can get close to the overflow accidentally or maliciously, e.g. when it is synchronized over the network by NTP or PTP.
This patch sets a maximum value of the system time to prevent the system time from getting too close to the overflow. The time can't be set to a larger value. When the maximum is reached in normal time accumulation, the clock will be stepped back by one week.
A new kernel sysctl time_max is added to select the maximum time. It can be set to 0 for no limit, 1 for one week before 32-bit time_t overflow, and 2 for one week before ktime_t overflow. The default value is 1 with 32-bit time_t and 2 with 64-bit time_t. This can be changed later to be always 2 when 64-bit versions of system calls working with time_t are available.
Cc: John Stultz john.stultz@linaro.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Prarit Bhargava prarit@redhat.com Cc: Richard Cochran richardcochran@gmail.com Cc: Arnd Bergmann arnd@arndb.de Signed-off-by: Miroslav Lichvar mlichvar@redhat.com ---
v2: - optimized code in accumulate_nsecs_to_secs() a bit - improved log message and increased its level to warning
Documentation/sysctl/kernel.txt | 19 ++++++++++++ include/linux/timekeeping.h | 5 ++++ include/uapi/linux/sysctl.h | 1 + kernel/sysctl.c | 9 ++++++ kernel/sysctl_binary.c | 1 + kernel/time/timekeeping.c | 64 +++++++++++++++++++++++++++++++++++++---- 6 files changed, 93 insertions(+), 6 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6fccb69..9b2bbdd 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -83,6 +83,7 @@ show up in /proc/sys/kernel: - sysctl_writes_strict - tainted - threads-max +- time_max - unknown_nmi_panic - watchdog - watchdog_thresh @@ -893,6 +894,24 @@ available RAM pages threads-max is reduced accordingly.
==============================================================
+time_max: + +Select the maximum allowed value of the system time. The system clock cannot be +set to a larger value and when it reaches the maximum on its own, it will be +stepped back by one week. + +0: No limit. + +1: One week before 32-bit time_t overflows, i.e. Jan 12 03:14:07 UTC 2038. + This is currently the default value with 32-bit time_t, but it may change + when 64-bit versions of system calls working with time_t are available. + +2: One week before time in the internal kernel representation (ktime_t) + overflows, i.e. Apr 4 23:47:16 UTC 2262. This is the default value with + 64-bit time_t. + +============================================================== + unknown_nmi_panic:
The value in this file affects behavior of handling NMI. When the diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ba0ae09..f25df65 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -5,6 +5,11 @@
void timekeeping_init(void); extern int timekeeping_suspended; +extern int sysctl_time_max; + +struct ctl_table; +extern int sysctl_time_max_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos);
/* * Get and set timeofday diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 0956373..8fd2aab 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -154,6 +154,7 @@ enum KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ KERN_PANIC_ON_WARN=77, /* int: call panic() in WARN() functions */ + KERN_TIMEMAX=78, /* int: select maximum allowed system time */ };
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e69201d..c2cfb33 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1130,6 +1130,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "time_max", + .data = &sysctl_time_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_time_max_handler, + .extra1 = &zero, + .extra2 = &two, + }, #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) { .procname = "timer_migration", diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 7e7746a..66c0946 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -138,6 +138,7 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, + { CTL_INT, KERN_TIMEMAX, "time_max" }, {} };
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3739ac6..4f4653b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,6 +32,9 @@ #define TK_MIRROR (1 << 1) #define TK_CLOCK_WAS_SET (1 << 2)
+#define SEC_PER_DAY (24 * 3600) +#define SEC_PER_WEEK (7 * SEC_PER_DAY) + /* * The most important data for readout fits into a single 64 byte * cache line. @@ -884,6 +887,37 @@ EXPORT_SYMBOL(getnstime_raw_and_real);
#endif /* CONFIG_NTP_PPS */
+/* Maximum allowed system time as a sysctl setting and in seconds */ +int sysctl_time_max __read_mostly; +static u64 time_max_sec __read_mostly; + +static void update_time_max_sec(int tm) +{ + if (tm > 1) { + /* One week before ktime_t overflow */ + time_max_sec = KTIME_SEC_MAX - SEC_PER_WEEK; + } else if (tm == 1) { + /* One week before 32-bit time_t overflow */ + time_max_sec = 0x7fffffff - SEC_PER_WEEK; + } else { + /* No limit */ + time_max_sec = -1; + } +} + +int sysctl_time_max_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + update_time_max_sec(sysctl_time_max); + return 0; +} + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set @@ -913,7 +947,7 @@ int do_settimeofday64(const struct timespec64 *ts) unsigned long flags; int ret = 0;
- if (!timespec64_valid_strict(ts)) + if (!timespec64_valid_strict(ts) || ts->tv_sec >= time_max_sec) return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -972,7 +1006,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tk), ts64); if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 || - !timespec64_valid_strict(&tmp)) { + !timespec64_valid_strict(&tmp) || tmp.tv_sec >= time_max_sec) { ret = -EINVAL; goto error; } @@ -1237,6 +1271,10 @@ void __init timekeeping_init(void) write_seqcount_begin(&tk_core.seq); ntp_init();
+ /* For now, prevent 32-bit time_t overflow by default */ + sysctl_time_max = sizeof(time_t) > 4 ? 2 : 1; + update_time_max_sec(sysctl_time_max); + clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -1692,17 +1730,31 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
/* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); - if (unlikely(leap)) { + + if (unlikely(leap || tk->xtime_sec >= time_max_sec)) { struct timespec64 ts; + s64 step = leap; + + /* If the system time has reached the allowed maximum, + step it back by one week */ + if (tk->xtime_sec >= time_max_sec) { + step = time_max_sec - tk->xtime_sec; + step -= SEC_PER_WEEK; + printk(KERN_WARNING + "Clock: maximum time reached, stepping" + " back to one week before maximum\n"); + }
- tk->xtime_sec += leap; + tk->xtime_sec += step;
- ts.tv_sec = leap; + ts.tv_sec = step; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts));
- __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + if (leap) + __timekeeping_set_tai_offset(tk, + tk->tai_offset - leap);
clock_set = TK_CLOCK_WAS_SET; }