/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)
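
/*
 * Illustrative arithmetic (not used by the code): at HZ=100,
 * NS_PER_TICK is 1000000000 / 100 = 10,000,000 ns, so the 100,000 ns
 * TIMER_SLOP lets a timer fire at most 1% of a tick early.
 */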

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
	u64 tsc_timestamp;	/* TSC at last update of time vals.  */
	u64 system_timestamp;	/* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;
	int tsc_shift;
	u32 version;
};

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of a 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}
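
/*
 * Example of the torn read that get64() guards against (illustrative
 * values): if the counter is 0x00000000ffffffff and the carry into
 * the high word happens between the two 32-bit loads, combining
 * h == 0 with l == 0 would yield 0, a value that never existed.
 * Re-reading p32[1] detects the change and retries.
 */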

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	preempt_disable();

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);

	preempt_enable();
}

static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has not been runn*ing* */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = 0;
	while (stolen >= NS_PER_TICK) {
		ticks++;
		stolen -= NS_PER_TICK;
	}
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = 0;
	while (blocked >= NS_PER_TICK) {
		ticks++;
		blocked -= NS_PER_TICK;
	}
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}
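
/*
 * Worked example for the accounting above (illustrative numbers):
 * with HZ=100 (NS_PER_TICK = 10,000,000), a stolen total of
 * 25,000,000 ns accounts 2 ticks and leaves 5,000,000 ns in
 * residual_stolen, to be folded into the next interrupt's total.
 */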


/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
	u64 cpu_khz = 1000000ULL << 32;
	const struct vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	do_div(cpu_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		cpu_khz <<= -info->tsc_shift;
	else
		cpu_khz >>= info->tsc_shift;

	return cpu_khz;
}
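
/*
 * Sanity check of the inversion above with made-up numbers:
 * tsc_to_system_mul is the 32.32 fixed-point ns-per-TSC-tick factor
 * (after tsc_shift is applied).  For a 2GHz TSC with tsc_shift == 0,
 * ns/tick = 0.5, so mul = 2^31; then 10^6 * 2^32 / 2^31 = 2,000,000
 * kHz, i.e. 2GHz, as expected.
 */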

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static unsigned get_time_values_from_xen(void)
{
	struct vcpu_time_info *src;
	struct shadow_time_info *dst;

	/* src is shared memory with the hypervisor, so we need to
	   make sure we get a consistent snapshot, even in the face of
	   being preempted. */
	src = &__get_cpu_var(xen_vcpu)->time;
	dst = &__get_cpu_var(shadow_time);

	do {
		dst->version = src->version;
		rmb();		/* fetch version before data */
		dst->tsc_timestamp = src->tsc_timestamp;
		dst->system_timestamp = src->system_time;
		dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
		dst->tsc_shift = src->tsc_shift;
		rmb();		/* test version after fetching data */
	} while ((src->version & 1) | (dst->version ^ src->version));

	return dst->version;
}
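
/*
 * The version field acts like a seqcount: Xen increments it once
 * before updating the time fields (making it odd) and once after
 * (making it even again).  So, for example, seeing version 7 means
 * an update is in flight and the copy must be retried; seeing 8
 * both before and after the copy means the snapshot is consistent.
 */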

/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul %5 ; "
		"mov %4,%%eax ; "
		"mov %%edx,%4 ; "
		"mul %5 ; "
		"xor %5,%5 ; "
		"add %4,%%eax ; "
		"adc %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}
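
/*
 * In portable terms, the asm above computes the wide product
 * (delta * mul_frac) >> 32 after pre-shifting delta.  A sketch of
 * the same thing, assuming a compiler with 128-bit integer support
 * (not relied on here):
 *
 *	return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
 */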

static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
	u64 now, delta;
	now = native_read_tsc();
	delta = now - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

cycle_t xen_clocksource_read(void)
{
	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
	cycle_t ret;
	unsigned version;

	do {
		version = get_time_values_from_xen();
		barrier();
		ret = shadow->system_timestamp + get_nsec_offset(shadow);
		barrier();
	} while (version != __get_cpu_var(xen_vcpu)->time.version);

	put_cpu_var(shadow_time);

	return ret;
}

static void xen_read_wallclock(struct timespec *ts)
{
	const struct shared_info *s = HYPERVISOR_shared_info;
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = s->wc_version;
		rmb();		/* fetch version before time */
		now.tv_sec = s->wc_sec;
		now.tv_nsec = s->wc_nsec;
		rmb();		/* fetch time before checking version */
	} while ((s->wc_version & 1) | (version ^ s->wc_version));

	delta = xen_clocksource_read();	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);

	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
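
/*
 * Note on mult/shift: the clocksource core converts counter values
 * with ns = (cycles * mult) >> shift.  Since xen_clocksource_read()
 * already returns nanoseconds, mult = 1 << XEN_SHIFT together with
 * shift = XEN_SHIFT makes that conversion the identity, e.g.
 * (1000 * 2^22) >> 22 = 1000.
 */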

/*
  Xen clockevent implementation

  Xen has two clockevent implementations:

  The old timer_op one works with all released versions of Xen prior
  to version 3.0.4.  This version of the hypervisor provides a
  single-shot timer with nanosecond resolution.  However, a 100Hz
  tick sharing the same event channel is delivered while the vcpu is
  running.  We don't care about or use this tick, but it will cause
  the core time code to think the timer fired too soon, and will end
  up resetting it each time.  It could be filtered, but doing so has
  complications when the ktime clocksource is not yet the xen
  clocksource (ie, at boot time).

  The new vcpu_op-based timer interface allows the tick timer period
  to be changed or turned off.  The tick timer is not useful as a
  periodic timer because events are only delivered to running vcpus.
  The one-shot timer can report when a timeout is in the past, so
  set_next_event is capable of returning -ETIME when appropriate.
  This interface is used when available.
*/


/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to a kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);	/* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};
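
/*
 * As with the clocksource, mult = 1 and shift = 0 make the
 * clockevent core's ns-to-cycles conversion a no-op, so the delta
 * passed to set_next_event() is already in nanoseconds, bounded by
 * min_delta_ns/max_delta_ns above.
 */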


static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	}
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

static void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &get_cpu_var(xen_clock_events);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;
	clockevents_register_device(evt);

	setup_runstate_info(cpu);

	put_cpu_var(xen_clock_events);
}

__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	get_time_values_from_xen();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	tsc_disable = 0;

	xen_setup_timer(cpu);
}