blob: 38da2b2e9ff227a3c0f38192c563d3257f1ea79a [file] [log] [blame]
alandonovan40b4ab62019-04-03 16:41:37 -04001// Copyright 2019 The Bazel Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package starlark
6
7// This file defines a simple execution-time profiler for Starlark.
8// It measures the wall time spent executing Starlark code, and emits a
9// gzipped protocol message in pprof format (github.com/google/pprof).
10//
11// When profiling is enabled, the interpreter calls the profiler to
12// indicate the start and end of each "span" or time interval. A leaf
13// function (whether Go or Starlark) has a single span. A function that
14// calls another function has spans for each interval in which it is the
15// top of the stack. (A LOAD instruction also ends a span.)
16//
17// At the start of a span, the interpreter records the current time in
18// the thread's topmost frame. At the end of the span, it obtains the
19// time again and subtracts the span start time. The difference is added
20// to an accumulator variable in the thread. If the accumulator exceeds
21// some fixed quantum (10ms, say), the profiler records the current call
22// stack and sends it to the profiler goroutine, along with the number
23// of quanta, which are subtracted. For example, if the accumulator
24// holds 3ms and then a completed span adds 25ms to it, its value is 28ms,
25// which exceeeds 10ms. The profiler records a stack with the value 20ms
26// (2 quanta), and the accumulator is left with 8ms.
27//
28// The profiler goroutine converts the stacks into the pprof format and
29// emits a gzip-compressed protocol message to the designated output
30// file. We use a hand-written streaming proto encoder to avoid
31// dependencies on pprof and proto, and to avoid the need to
32// materialize the profile data structure in memory.
33//
34// A limitation of this profiler is that it measures wall time, which
35// does not necessarily correspond to CPU time. A CPU profiler requires
36// that only running (not runnable) threads are sampled; this is
37// commonly achieved by having the kernel deliver a (PROF) signal to an
38// arbitrary running thread, through setitimer(2). The CPU profiler in the
39// Go runtime uses this mechanism, but it is not possible for a Go
40// application to register a SIGPROF handler, nor is it possible for a
41// Go handler for some other signal to read the stack pointer of
42// the interrupted thread.
43//
44// Two caveats:
45// (1) it is tempting to send the leaf Frame directly to the profiler
46// goroutine instead of making a copy of the stack, since a Frame is a
47// spaghetti stack--a linked list. However, as soon as execution
48// resumes, the stack's Frame.pc values may be mutated, so Frames are
49// not safe to share with the asynchronous profiler goroutine.
50// (2) it is tempting to use Callables as keys in a map when tabulating
51// the pprof protocols's Function entities. However, we cannot assume
52// that Callables are valid map keys, and furthermore we must not
53// pin function values in memory indefinitely as this may cause lambda
54// values to keep their free variables live much longer than necessary.
55
56// TODO(adonovan):
57// - make Start/Stop fully thread-safe.
58// - fix the pc hack.
59// - experiment with other values of quantum.
60
61import (
62 "bufio"
63 "bytes"
64 "compress/gzip"
65 "encoding/binary"
66 "fmt"
67 "io"
68 "log"
69 "reflect"
70 "sync/atomic"
71 "time"
72 "unsafe"
73
74 "go.starlark.net/syntax"
75)
76
77// StartProfile enables time profiling of all Starlark threads,
78// and writes a profile in pprof format to w.
79// It must be followed by a call to StopProfiler to stop
80// the profiler and finalize the profile.
81//
82// StartProfile returns an error if profiling was already enabled.
83//
84// StartProfile must not be called concurrently with Starlark execution.
85func StartProfile(w io.Writer) error {
86 if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) {
87 return fmt.Errorf("profiler already running")
88 }
89
90 // TODO(adonovan): make the API fully concurrency-safe.
91 // The main challenge is racy reads/writes of profiler.events,
92 // and of send/close races on the channel it refers to.
93 // It's easy to solve them with a mutex but harder to do
94 // it efficiently.
95
96 profiler.events = make(chan *profEvent, 1)
97 profiler.done = make(chan error)
98
99 go profile(w)
100
101 return nil
102}
103
104// StopProfiler stops the profiler started by a prior call to
105// StartProfile and finalizes the profile. It returns an error if the
106// profile could not be completed.
107//
108// StopProfiler must not be called concurrently with Starlark execution.
109func StopProfile() error {
110 // Terminate the profiler goroutine and get its result.
111 close(profiler.events)
112 err := <-profiler.done
113
114 profiler.done = nil
115 profiler.events = nil
116 atomic.StoreUint32(&profiler.on, 0)
117
118 return err
119}
120
121// globals
122var profiler struct {
123 on uint32 // nonzero => profiler running
124 events chan *profEvent // profile events from interpreter threads
125 done chan error // indicates profiler goroutine is ready
126}
127
128func (thread *Thread) beginProfSpan() {
129 if profiler.events == nil {
130 return // profiling not enabled
131 }
132
alandonovand9868e92019-04-19 14:47:26 -0400133 thread.frameAt(0).spanStart = nanotime()
alandonovan40b4ab62019-04-03 16:41:37 -0400134}
135
136// TODO(adonovan): experiment with smaller values,
137// which trade space and time for greater precision.
138const quantum = 10 * time.Millisecond
139
140func (thread *Thread) endProfSpan() {
141 if profiler.events == nil {
142 return // profiling not enabled
143 }
144
145 // Add the span to the thread's accumulator.
alandonovand9868e92019-04-19 14:47:26 -0400146 thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart)
alandonovan40b4ab62019-04-03 16:41:37 -0400147 if thread.proftime < quantum {
148 return
149 }
150
151 // Only record complete quanta.
152 n := thread.proftime / quantum
153 thread.proftime -= n * quantum
154
155 // Copy the stack.
156 // (We can't save thread.frame because its pc will change.)
157 ev := &profEvent{
158 thread: thread,
159 time: n * quantum,
160 }
161 ev.stack = ev.stackSpace[:0]
alandonovand9868e92019-04-19 14:47:26 -0400162 for i := range thread.stack {
163 fr := thread.frameAt(i)
alandonovan40b4ab62019-04-03 16:41:37 -0400164 ev.stack = append(ev.stack, profFrame{
165 pos: fr.Position(),
166 fn: fr.Callable(),
167 pc: fr.pc,
168 })
169 }
170
171 profiler.events <- ev
172}
173
174type profEvent struct {
175 thread *Thread // currently unused
176 time time.Duration
177 stack []profFrame
178 stackSpace [8]profFrame // initial space for stack
179}
180
181type profFrame struct {
182 fn Callable // don't hold this live for too long (prevents GC of lambdas)
183 pc uint32 // program counter (Starlark frames only)
184 pos syntax.Position // position of pc within this frame
185}
186
187// profile is the profiler goroutine.
188// It runs until StopProfiler is called.
189func profile(w io.Writer) {
190 // Field numbers from pprof protocol.
191 // See https://github.com/google/pprof/blob/master/proto/profile.proto
192 const (
193 Profile_sample_type = 1 // repeated ValueType
194 Profile_sample = 2 // repeated Sample
195 Profile_mapping = 3 // repeated Mapping
196 Profile_location = 4 // repeated Location
197 Profile_function = 5 // repeated Function
198 Profile_string_table = 6 // repeated string
199 Profile_time_nanos = 9 // int64
200 Profile_duration_nanos = 10 // int64
201 Profile_period_type = 11 // ValueType
202 Profile_period = 12 // int64
203
204 ValueType_type = 1 // int64
205 ValueType_unit = 2 // int64
206
207 Sample_location_id = 1 // repeated uint64
208 Sample_value = 2 // repeated int64
209 Sample_label = 3 // repeated Label
210
211 Label_key = 1 // int64
212 Label_str = 2 // int64
213 Label_num = 3 // int64
214 Label_num_unit = 4 // int64
215
216 Location_id = 1 // uint64
217 Location_mapping_id = 2 // uint64
218 Location_address = 3 // uint64
219 Location_line = 4 // repeated Line
220
221 Line_function_id = 1 // uint64
222 Line_line = 2 // int64
223
224 Function_id = 1 // uint64
225 Function_name = 2 // int64
226 Function_system_name = 3 // int64
227 Function_filename = 4 // int64
228 Function_start_line = 5 // int64
229 )
230
231 bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks
232 gz := gzip.NewWriter(bufw)
233 enc := protoEncoder{w: gz}
234
235 // strings
236 stringIndex := make(map[string]int64)
237 str := func(s string) int64 {
238 i, ok := stringIndex[s]
239 if !ok {
240 i = int64(len(stringIndex))
241 enc.string(Profile_string_table, s)
242 stringIndex[s] = i
243 }
244 return i
245 }
246 str("") // entry 0
247
248 // functions
249 //
250 // function returns the ID of a Callable for use in Line.FunctionId.
251 // The ID is the same as the function's logical address,
252 // which is supplied by the caller to avoid the need to recompute it.
253 functionId := make(map[uintptr]uint64)
254 function := func(fn Callable, addr uintptr) uint64 {
255 id, ok := functionId[addr]
256 if !ok {
257 id = uint64(addr)
258
259 var pos syntax.Position
260 if fn, ok := fn.(callableWithPosition); ok {
261 pos = fn.Position()
262 }
263
264 name := fn.Name()
265 if name == "<toplevel>" {
266 name = pos.Filename()
267 }
268
269 nameIndex := str(name)
270
271 fun := new(bytes.Buffer)
272 funenc := protoEncoder{w: fun}
273 funenc.uint(Function_id, id)
274 funenc.int(Function_name, nameIndex)
275 funenc.int(Function_system_name, nameIndex)
276 funenc.int(Function_filename, str(pos.Filename()))
277 funenc.int(Function_start_line, int64(pos.Line))
278 enc.bytes(Profile_function, fun.Bytes())
279
280 functionId[addr] = id
281 }
282 return id
283 }
284
285 // locations
286 //
287 // location returns the ID of the location denoted by fr.
288 // For Starlark frames, this is the Frame pc.
289 locationId := make(map[uintptr]uint64)
290 location := func(fr profFrame) uint64 {
291 fnAddr := profFuncAddr(fr.fn)
292
293 // For Starlark functions, the frame position
294 // represents the current PC value.
295 // Mix it into the low bits of the address.
296 // This is super hacky and may result in collisions
297 // in large functions or if functions are numerous.
298 // TODO(adonovan): fix: try making this cleaner by treating
299 // each bytecode segment as a Profile.Mapping.
300 pcAddr := fnAddr
301 if _, ok := fr.fn.(*Function); ok {
302 pcAddr = (pcAddr << 16) ^ uintptr(fr.pc)
303 }
304
305 id, ok := locationId[pcAddr]
306 if !ok {
307 id = uint64(pcAddr)
308
309 line := new(bytes.Buffer)
310 lineenc := protoEncoder{w: line}
311 lineenc.uint(Line_function_id, function(fr.fn, fnAddr))
312 lineenc.int(Line_line, int64(fr.pos.Line))
313 loc := new(bytes.Buffer)
314 locenc := protoEncoder{w: loc}
315 locenc.uint(Location_id, id)
316 locenc.uint(Location_address, uint64(pcAddr))
317 locenc.bytes(Location_line, line.Bytes())
318 enc.bytes(Profile_location, loc.Bytes())
319
320 locationId[pcAddr] = id
321 }
322 return id
323 }
324
325 wallNanos := new(bytes.Buffer)
326 wnenc := protoEncoder{w: wallNanos}
327 wnenc.int(ValueType_type, str("wall"))
328 wnenc.int(ValueType_unit, str("nanoseconds"))
329
330 // informational fields of Profile
alandonovan61fc5422019-04-05 16:12:13 -0400331 enc.bytes(Profile_sample_type, wallNanos.Bytes())
332 enc.int(Profile_period, quantum.Nanoseconds()) // magnitude of sampling period
333 enc.bytes(Profile_period_type, wallNanos.Bytes()) // dimension and unit of period
334 enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile
alandonovan40b4ab62019-04-03 16:41:37 -0400335
alandonovan61fc5422019-04-05 16:12:13 -0400336 startNano := nanotime()
alandonovan40b4ab62019-04-03 16:41:37 -0400337
338 // Read profile events from the channel
339 // until it is closed by StopProfiler.
340 for e := range profiler.events {
341 sample := new(bytes.Buffer)
342 sampleenc := protoEncoder{w: sample}
343 sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds
344 for _, fr := range e.stack {
345 sampleenc.uint(Sample_location_id, location(fr))
346 }
347 enc.bytes(Profile_sample, sample.Bytes())
348 }
349
alandonovan61fc5422019-04-05 16:12:13 -0400350 endNano := nanotime()
351 enc.int(Profile_duration_nanos, endNano-startNano)
alandonovan40b4ab62019-04-03 16:41:37 -0400352
353 err := gz.Close() // Close reports any prior write error
354 if flushErr := bufw.Flush(); err == nil {
355 err = flushErr
356 }
357 profiler.done <- err
358}
359
360// nanotime returns the time in nanoseconds since epoch.
361// It is implemented by runtime.nanotime using the linkname hack;
362// runtime.nanotime is defined for all OSs/ARCHS and uses the
alandonovan61fc5422019-04-05 16:12:13 -0400363// monotonic system clock, which there is no portable way to access.
364// Should that function ever go away, these alternatives exist:
alandonovan40b4ab62019-04-03 16:41:37 -0400365//
366// // POSIX only. REALTIME not MONOTONIC. 17ns.
367// var tv syscall.Timeval
368// syscall.Gettimeofday(&tv) // can't fail
369// return tv.Nano()
370//
371// // Portable. REALTIME not MONOTONIC. 46ns.
372// return time.Now().Nanoseconds()
373//
alandonovan61fc5422019-04-05 16:12:13 -0400374// // POSIX only. Adds a dependency.
375// import "golang.org/x/sys/unix"
376// var ts unix.Timespec
377// unix.ClockGettime(CLOCK_MONOTONIC, &ts) // can't fail
378// return unix.TimespecToNsec(ts)
379//
alandonovan40b4ab62019-04-03 16:41:37 -0400380//go:linkname nanotime runtime.nanotime
381func nanotime() int64
382
383// profFuncAddr returns the canonical "address"
384// of a Callable for use by the profiler.
385func profFuncAddr(fn Callable) uintptr {
386 switch fn := fn.(type) {
387 case *Builtin:
388 return reflect.ValueOf(fn.fn).Pointer()
389 case *Function:
390 return uintptr(unsafe.Pointer(fn.funcode))
391 }
392
393 // User-defined callable types are typically of
394 // of kind pointer-to-struct. Handle them specially.
395 if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr {
396 return v.Pointer()
397 }
398
399 // Address zero is reserved by the protocol.
400 // Use 1 for callables we don't recognize.
401 log.Printf("Starlark profiler: no address for Callable %T", fn)
402 return 1
403}
404
405// We encode the protocol message by hand to avoid making
406// the interpreter depend on both github.com/google/pprof
407// and github.com/golang/protobuf.
408//
409// This also avoids the need to materialize a protocol message object
410// tree of unbounded size and serialize it all at the end.
411// The pprof format appears to have been designed to
412// permit streaming implementations such as this one.
413//
414// See https://developers.google.com/protocol-buffers/docs/encoding.
415type protoEncoder struct {
416 w io.Writer // *bytes.Buffer or *gzip.Writer
417 tmp [binary.MaxVarintLen64]byte
418}
419
420func (e *protoEncoder) uvarint(x uint64) {
421 n := binary.PutUvarint(e.tmp[:], x)
422 e.w.Write(e.tmp[:n])
423}
424
425func (e *protoEncoder) tag(field, wire uint) {
426 e.uvarint(uint64(field<<3 | wire))
427}
428
429func (e *protoEncoder) string(field uint, s string) {
430 e.tag(field, 2) // length-delimited
431 e.uvarint(uint64(len(s)))
432 io.WriteString(e.w, s)
433}
434
435func (e *protoEncoder) bytes(field uint, b []byte) {
436 e.tag(field, 2) // length-delimited
437 e.uvarint(uint64(len(b)))
438 e.w.Write(b)
439}
440
441func (e *protoEncoder) uint(field uint, x uint64) {
442 e.tag(field, 0) // varint
443 e.uvarint(x)
444}
445
446func (e *protoEncoder) int(field uint, x int64) {
447 e.tag(field, 0) // varint
448 e.uvarint(uint64(x))
449}