/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

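/*
 * Clocksource mult/shift pair.  With mult = 1 << XEN_SHIFT and
 * shift = XEN_SHIFT, the clocksource's "cycles" are reported
 * directly as nanoseconds (see xen_clocksource below).
 */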
#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

static cycle_t xen_clocksource_read(void);

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
	u64 tsc_timestamp;	/* TSC at last update of time vals.  */
	u64 system_timestamp;	/* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;
	int tsc_shift;
	u32 version;
};

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of a 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	preempt_disable();

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);

	preempt_enable();
}

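/*
 * Ask Xen to keep this VCPU's runstate info up to date in a per-cpu
 * memory area, so the kernel can read it without a hypercall.
 */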
static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

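/*
 * Called from the timer interrupt: convert the ns this VCPU spent
 * runnable/offline into stolen ticks and the ns it spent blocked
 * into idle ticks, carrying any sub-tick remainder over to next time.
 */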
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has spent not running
	   (runnable, blocked or offline) */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = 0;
	while (stolen >= NS_PER_TICK) {
		ticks++;
		stolen -= NS_PER_TICK;
	}
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = 0;
	while (blocked >= NS_PER_TICK) {
		ticks++;
		blocked -= NS_PER_TICK;
	}
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}

/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds: the time the VCPU has spent in the RUNNING and
 * BLOCKED states.
 */
unsigned long long xen_sched_clock(void)
{
	struct vcpu_runstate_info state;
	cycle_t now = xen_clocksource_read();
	s64 offset;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	offset = now - state.state_entry_time;
	if (offset < 0)
		offset = 0;

	return state.time[RUNSTATE_blocked] +
		state.time[RUNSTATE_running] +
		offset;
}

/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
	u64 cpu_khz = 1000000ULL << 32;
	const struct vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

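	/*
	 * tsc_to_system_mul converts (shift-adjusted) TSC ticks to ns
	 * as ns = ticks * mul / 2^32, so kHz = 10^6 * 2^32 / mul; the
	 * tsc_shift is then undone to recover the raw TSC frequency.
	 */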
	do_div(cpu_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		cpu_khz <<= -info->tsc_shift;
	else
		cpu_khz >>= info->tsc_shift;

	return cpu_khz;
}

/*
 * Reads a consistent set of time-base values from Xen into a shadow
 * data area.
 */
static unsigned get_time_values_from_xen(void)
{
	struct vcpu_time_info *src;
	struct shadow_time_info *dst;

	/* src is shared memory with the hypervisor, so we need to
	   make sure we get a consistent snapshot, even in the face of
	   being preempted. */
	src = &__get_cpu_var(xen_vcpu)->time;
	dst = &__get_cpu_var(shadow_time);

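	/*
	 * An odd version means Xen is mid-update; a changed version
	 * means we raced with an update.  Retry in either case.
	 */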
	do {
		dst->version = src->version;
		rmb();		/* fetch version before data */
		dst->tsc_timestamp = src->tsc_timestamp;
		dst->system_timestamp = src->system_time;
		dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
		dst->tsc_shift = src->tsc_shift;
		rmb();		/* test version after fetching data */
	} while ((src->version & 1) | (dst->version ^ src->version));

	return dst->version;
}

/*
 * Scale a 64-bit delta by shifting and multiplying by a 32-bit
 * fraction, yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

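	/*
	 * Compute (delta * mul_frac) >> 32: a 64x32->96 bit multiply,
	 * keeping the middle 64 bits of the product.
	 */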
#ifdef __i386__
	__asm__ (
		"mul %5 ; "
		"mov %4,%%eax ; "
		"mov %%edx,%4 ; "
		"mul %5 ; "
		"xor %5,%5 ; "
		"add %4,%%eax ; "
		"adc %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}

static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
	u64 now, delta;
	now = native_read_tsc();
	delta = now - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

static cycle_t xen_clocksource_read(void)
{
	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
	cycle_t ret;
	unsigned version;

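	/* Retry if Xen updated the time parameters while we read them. */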
	do {
		version = get_time_values_from_xen();
		barrier();
		ret = shadow->system_timestamp + get_nsec_offset(shadow);
		barrier();
	} while (version != __get_cpu_var(xen_vcpu)->time.version);

	put_cpu_var(shadow_time);

	return ret;
}

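/*
 * Xen publishes the wallclock time at system boot; the current
 * wallclock is that plus the system time (ns since boot) from the
 * clocksource.
 */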
static void xen_read_wallclock(struct timespec *ts)
{
	const struct shared_info *s = HYPERVISOR_shared_info;
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = s->wc_version;
		rmb();		/* fetch version before time */
		now.tv_sec = s->wc_sec;
		now.tv_nsec = s->wc_nsec;
		rmb();		/* fetch time before checking version */
	} while ((s->wc_version & 1) | (version ^ s->wc_version));

	delta = xen_clocksource_read();	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);

	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, a 100Hz
   tick, which is delivered while the vcpu is running, shares the
   same event channel.  We don't care about or use this tick, but it
   will cause the core time code to think the timer fired too soon,
   and will end up resetting it each time.  It could be filtered, but
   doing so has complications when the ktime clocksource is not yet
   the xen clocksource (i.e., at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/

/*
   Get a hypervisor absolute time.  In theory we could maintain an
   offset between the kernel's time and the hypervisor's time, and
   apply that to a kernel's absolute timeout.  Unfortunately the
   hypervisor and kernel times can drift even if the kernel is using
   the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);	/* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};

static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	}
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

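	/* VCPU_SSHOTTMR_future makes Xen return -ETIME if the timeout
	   is already in the past, rather than firing immediately. */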
	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

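/* Default to the timer_op interface; xen_time_init switches this to
   the vcpu_op interface if the hypervisor supports it. */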
static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

static void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

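	/* Bind this CPU's VIRQ_TIMER event channel to the timer
	   interrupt handler. */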
	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &get_cpu_var(xen_clock_events);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;
	clockevents_register_device(evt);

	setup_runstate_info(cpu);

	put_cpu_var(xen_clock_events);
}

__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	get_time_values_from_xen();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off the 100Hz tick, so we have
		   the vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	tsc_disable = 0;

	xen_setup_timer(cpu);
}
573}