
Commit 2fe59f5

npiggin authored and KAGA-KOKO committed
timers: Fix excessive granularity of new timers after a nohz idle
When a timer base is idle, it is forwarded when a new timer is added to ensure that granularity does not become excessive. When not idle, the timer tick is expected to increment the base. However there are several problems:

- If an existing timer is modified, the base is forwarded only after the index is calculated.
- The base is not forwarded by add_timer_on.
- There is a window after a timer is restarted from a nohz idle, after it is marked not-idle and before the timer tick on this CPU, where a timer may be added but the ancient base does not get forwarded.

These result in excessive granularity (a 1 jiffy timeout can blow out to 100s of jiffies), which causes the RCU lockup detector to trigger, among other things.

Fix this by keeping track of whether the timer base has been idle since it was last run or forwarded, and if so then forward it before adding a new timer.

There is still a case where mod_timer optimises a pending timer mod with the same expiry time, where the timer can see excessive granularity relative to the new, shorter interval. A comment is added, but the optimisation itself is left in place because it is an important fast path for networking.

This has been tested and found to fix the RCU softlockup messages.

Testing was also done with tracing to measure requested versus achieved wakeup latencies for all non-deferrable timers in an idle system (with no lockup watchdogs running). Wakeup latency relative to absolute latency is calculated (note this suffers from round-up skew at low absolute times) and analysed:

              max     avg     std
  upstream  506.0    1.20    4.68
  patched     2.0    1.08    0.15

The bug was noticed due to the lockup detector Kconfig changes dropping it out of people's .configs and resulting in larger base clk skew. When the lockup detectors are enabled, no CPU can go idle for longer than 4 seconds, which limits the granularity errors. Sub-optimal timer behaviour is observable on a smaller scale in that case:

              max     avg     std
  upstream    9.0    1.05    0.19
  patched     2.0    1.04    0.11

Fixes: a683f39 ("timers: Forward the wheel clock whenever possible")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: David Miller <davem@davemloft.net>
Cc: dzickus@redhat.com
Cc: sfr@canb.auug.org.au
Cc: mpe@ellerman.id.au
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: linuxarm@huawei.com
Cc: abdhalee@linux.vnet.ibm.com
Cc: John Stultz <john.stultz@linaro.org>
Cc: akpm@linux-foundation.org
Cc: paulmck@linux.vnet.ibm.com
Cc: torvalds@linux-foundation.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/20170822084348.21436-1-npiggin@gmail.com
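[Editor's note] The "excessive granularity" numbers above follow from the level structure of the timer wheel: the bucket a timer is queued into is chosen from the distance between its expiry and base->clk, and each higher level trades range for coarser granularity. Below is a minimal userspace sketch of that level selection, assuming the wheel constants in kernel/time/timer.c as of this commit (LVL_CLK_SHIFT = 3, LVL_BITS = 6); wheel_level() is a simplified stand-in for calc_wheel_index() and is not kernel code.

    /*
     * Userspace model of timer-wheel level selection. Assumes the v4.13
     * wheel constants; the real wheel also caps the depth and clamps
     * huge deltas into the last level, which this sketch omits.
     */
    #include <stdio.h>

    #define LVL_CLK_SHIFT  3
    #define LVL_BITS       6
    #define LVL_SIZE       (1UL << LVL_BITS)
    #define LVL_SHIFT(n)   ((n) * LVL_CLK_SHIFT)
    #define LVL_GRAN(n)    (1UL << LVL_SHIFT(n))
    #define LVL_START(n)   ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

    /* Level that a delta of 'delta' jiffies lands in. */
    static int wheel_level(unsigned long delta)
    {
            int lvl;

            for (lvl = 0; lvl < 8; lvl++) {
                    if (delta < LVL_START(lvl + 1))
                            break;
            }
            return lvl;
    }

    static void show(const char *what, unsigned long clk, unsigned long expires)
    {
            unsigned long delta = expires - clk;
            int lvl = wheel_level(delta);

            printf("%-22s delta %5lu -> level %d, granularity %3lu jiffies\n",
                   what, delta, lvl, LVL_GRAN(lvl));
    }

    int main(void)
    {
            unsigned long jnow = 5000;

            /* A 1 jiffy timeout against a base->clk left 5000 jiffies
             * behind by a nohz idle, versus the same timeout once the
             * base has been forwarded to jiffies. */
            show("stale base->clk", 0, jnow + 1);
            show("forwarded base->clk", jnow, jnow + 1);
            return 0;
    }

Against the stale base, the jiffies + 1 timeout is indexed as if it were a ~5001 jiffy timeout and lands in a level whose buckets are 512 jiffies wide; after forwarding, the same timeout lands in level 0 with 1 jiffy granularity. That difference is the shape of the max column in the tables above.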
1 parent 14ccee7 commit 2fe59f5

1 file changed: kernel/time/timer.c
Lines changed: 41 additions & 9 deletions
@@ -203,6 +203,7 @@ struct timer_base {
 	bool			migration_enabled;
 	bool			nohz_active;
 	bool			is_idle;
+	bool			must_forward_clk;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)

 static inline void forward_timer_base(struct timer_base *base)
 {
-	unsigned long jnow = READ_ONCE(jiffies);
+	unsigned long jnow;

 	/*
-	 * We only forward the base when it's idle and we have a delta between
-	 * base clock and jiffies.
+	 * We only forward the base when we are idle or have just come out of
+	 * idle (must_forward_clk logic), and have a delta between base clock
+	 * and jiffies. In the common case, run_timers will take care of it.
 	 */
-	if (!base->is_idle || (long) (jnow - base->clk) < 2)
+	if (likely(!base->must_forward_clk))
+		return;
+
+	jnow = READ_ONCE(jiffies);
+	base->must_forward_clk = base->is_idle;
+	if ((long)(jnow - base->clk) < 2)
 		return;

 	/*
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	 * same array bucket then just return:
 	 */
 	if (timer_pending(timer)) {
+		/*
+		 * The downside of this optimization is that it can result in
+		 * larger granularity than you would get from adding a new
+		 * timer with this expiry.
+		 */
 		if (timer->expires == expires)
 			return 1;

@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 * dequeue/enqueue dance.
 		 */
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);

 		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		}
 	} else {
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);
 	}

 	ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 			raw_spin_lock(&base->lock);
 			WRITE_ONCE(timer->flags,
 				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
+			forward_timer_base(base);
 		}
 	}

-	/* Try to forward a stale timer base clock */
-	forward_timer_base(base);
-
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
 		WRITE_ONCE(timer->flags,
 			   (timer->flags & ~TIMER_BASEMASK) | cpu);
 	}
+	forward_timer_base(base);

 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
@@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		if (!is_max_delta)
 			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
 		/*
-		 * If we expect to sleep more than a tick, mark the base idle:
+		 * If we expect to sleep more than a tick, mark the base idle.
+		 * Also the tick is stopped so any added timer must forward
+		 * the base clk itself to keep granularity small. This idle
+		 * logic is only maintained for the BASE_STD base; deferrable
+		 * timers may still see large granularity skew (by design).
 		 */
-		if ((expires - basem) > TICK_NSEC)
+		if ((expires - basem) > TICK_NSEC) {
+			base->must_forward_clk = true;
 			base->is_idle = true;
+		}
 	}
 	raw_spin_unlock(&base->lock);

@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 {
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

+	/*
+	 * must_forward_clk must be cleared before running timers so that any
+	 * timer functions that call mod_timer will not try to forward the
+	 * base. Idle tracking / clock forwarding logic is only used with
+	 * BASE_STD timers.
+	 *
+	 * The deferrable base does not do idle tracking at all, so we do
+	 * not forward it. This can result in very large variations in
+	 * granularity for deferrable timers, but they can be deferred for
+	 * long periods due to idle.
+	 */
+	base->must_forward_clk = false;
+
 	__run_timers(base);
 	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
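[Editor's note] Taken together, the hunks give must_forward_clk a small lifecycle: set in get_next_timer_interrupt() when the base goes nohz idle, consumed (and re-armed while the base is still idle) by forward_timer_base() on every timer add or mod, and cleared by run_timer_softirq() once the tick is expiring timers again. The userspace model below walks that lifecycle under simplifying assumptions: the not-idle marking (timer_clear_idle() in the kernel) is folded into the wake-up step, and forwarding always moves clk all the way to jiffies, whereas the real forward_timer_base() stops at base->next_expiry.

    /*
     * Userspace model of the must_forward_clk lifecycle added by this
     * commit. Not kernel code; simplified as described above.
     */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct base_model {
            unsigned long clk;
            bool is_idle;
            bool must_forward_clk;
    };

    static unsigned long jiffies_model;  /* stand-in for READ_ONCE(jiffies) */

    /* get_next_timer_interrupt(): expecting to sleep for more than a tick */
    static void model_enter_idle(struct base_model *base)
    {
            base->must_forward_clk = true;
            base->is_idle = true;
    }

    /* forward_timer_base(): now called before every timer add or mod */
    static void model_forward(struct base_model *base)
    {
            if (!base->must_forward_clk)
                    return;

            /* Stay armed while still idle; disarm once the tick is back. */
            base->must_forward_clk = base->is_idle;
            if ((long)(jiffies_model - base->clk) < 2)
                    return;
            base->clk = jiffies_model;
    }

    /* run_timer_softirq(): the tick is expiring timers again */
    static void model_run_timers(struct base_model *base)
    {
            base->must_forward_clk = false;
            base->clk = jiffies_model;
    }

    int main(void)
    {
            struct base_model base = { .clk = 0 };

            model_enter_idle(&base);

            /* Wake after 5000 idle jiffies: marked not-idle, but the
             * timer tick on this CPU has not run yet - the window the
             * changelog describes. */
            jiffies_model = 5000;
            base.is_idle = false;

            /* A timer added in that window now forwards the ancient base. */
            model_forward(&base);
            assert(base.clk == 5000);         /* no longer stale */
            assert(!base.must_forward_clk);   /* disarmed: out of idle */

            printf("base->clk forwarded to %lu before the first tick\n",
                   base.clk);

            /* Later the tick resumes and keeps the flag clear. */
            model_run_timers(&base);
            return 0;
    }

The load-bearing line is base->must_forward_clk = base->is_idle; in the forward path: while the CPU is still nohz idle, every timer add keeps forwarding the base, and once it is out of idle the flag stays clear (the tick owns base->clk again) until the next idle period re-arms it.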
