Merge pull request #3754 from ctiller/latent-see

Latency profiling support
diff --git a/.gitignore b/.gitignore
index 18b4281..bfbc393 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,10 +37,11 @@
 
 # Temporary test reports
 report.xml
+latency_trace.txt
 
 # port server log
 portlog.txt
 
 # gyp generated make files
 *-gyp.mk
-out
\ No newline at end of file
+out
diff --git a/BUILD b/BUILD
index ffeb31e..262b2c6 100644
--- a/BUILD
+++ b/BUILD
@@ -44,6 +44,7 @@
 cc_library(
   name = "gpr",
   srcs = [
+    "src/core/profiling/timers.h",
     "src/core/support/block_annotate.h",
     "src/core/support/env.h",
     "src/core/support/file.h",
@@ -53,6 +54,8 @@
     "src/core/support/string_win32.h",
     "src/core/support/thd_internal.h",
     "src/core/support/time_precise.h",
+    "src/core/profiling/basic_timers.c",
+    "src/core/profiling/stap_timers.c",
     "src/core/support/alloc.c",
     "src/core/support/cmdline.c",
     "src/core/support/cpu_iphone.c",
@@ -88,6 +91,7 @@
     "src/core/support/thd_win32.c",
     "src/core/support/time.c",
     "src/core/support/time_posix.c",
+    "src/core/support/time_precise.c",
     "src/core/support/time_win32.c",
     "src/core/support/tls_pthread.c",
   ],
@@ -217,7 +221,6 @@
     "src/core/json/json_common.h",
     "src/core/json/json_reader.h",
     "src/core/json/json_writer.h",
-    "src/core/profiling/timers.h",
     "src/core/statistics/census_interface.h",
     "src/core/statistics/census_rpc_stats.h",
     "src/core/surface/api_trace.h",
@@ -356,8 +359,6 @@
     "src/core/json/json_reader.c",
     "src/core/json/json_string.c",
     "src/core/json/json_writer.c",
-    "src/core/profiling/basic_timers.c",
-    "src/core/profiling/stap_timers.c",
     "src/core/surface/api_trace.c",
     "src/core/surface/byte_buffer.c",
     "src/core/surface/byte_buffer_queue.c",
@@ -504,7 +505,6 @@
     "src/core/json/json_common.h",
     "src/core/json/json_reader.h",
     "src/core/json/json_writer.h",
-    "src/core/profiling/timers.h",
     "src/core/statistics/census_interface.h",
     "src/core/statistics/census_rpc_stats.h",
     "src/core/surface/api_trace.h",
@@ -623,8 +623,6 @@
     "src/core/json/json_reader.c",
     "src/core/json/json_string.c",
     "src/core/json/json_writer.c",
-    "src/core/profiling/basic_timers.c",
-    "src/core/profiling/stap_timers.c",
     "src/core/surface/api_trace.c",
     "src/core/surface/byte_buffer.c",
     "src/core/surface/byte_buffer_queue.c",
@@ -962,6 +960,8 @@
 objc_library(
   name = "gpr_objc",
   srcs = [
+    "src/core/profiling/basic_timers.c",
+    "src/core/profiling/stap_timers.c",
     "src/core/support/alloc.c",
     "src/core/support/cmdline.c",
     "src/core/support/cpu_iphone.c",
@@ -997,6 +997,7 @@
     "src/core/support/thd_win32.c",
     "src/core/support/time.c",
     "src/core/support/time_posix.c",
+    "src/core/support/time_precise.c",
     "src/core/support/time_win32.c",
     "src/core/support/tls_pthread.c",
   ],
@@ -1028,6 +1029,7 @@
     "include/grpc/support/tls_msvc.h",
     "include/grpc/support/tls_pthread.h",
     "include/grpc/support/useful.h",
+    "src/core/profiling/timers.h",
     "src/core/support/block_annotate.h",
     "src/core/support/env.h",
     "src/core/support/file.h",
@@ -1147,8 +1149,6 @@
     "src/core/json/json_reader.c",
     "src/core/json/json_string.c",
     "src/core/json/json_writer.c",
-    "src/core/profiling/basic_timers.c",
-    "src/core/profiling/stap_timers.c",
     "src/core/surface/api_trace.c",
     "src/core/surface/byte_buffer.c",
     "src/core/surface/byte_buffer_queue.c",
@@ -1292,7 +1292,6 @@
     "src/core/json/json_common.h",
     "src/core/json/json_reader.h",
     "src/core/json/json_writer.h",
-    "src/core/profiling/timers.h",
     "src/core/statistics/census_interface.h",
     "src/core/statistics/census_rpc_stats.h",
     "src/core/surface/api_trace.h",
diff --git a/Makefile b/Makefile
index 8eb94f7..788fdb5 100644
--- a/Makefile
+++ b/Makefile
@@ -3893,6 +3893,8 @@
 
 
 LIBGPR_SRC = \
+    src/core/profiling/basic_timers.c \
+    src/core/profiling/stap_timers.c \
     src/core/support/alloc.c \
     src/core/support/cmdline.c \
     src/core/support/cpu_iphone.c \
@@ -3928,6 +3930,7 @@
     src/core/support/thd_win32.c \
     src/core/support/time.c \
     src/core/support/time_posix.c \
+    src/core/support/time_precise.c \
     src/core/support/time_win32.c \
     src/core/support/tls_pthread.c \
 
@@ -4117,8 +4120,6 @@
     src/core/json/json_reader.c \
     src/core/json/json_string.c \
     src/core/json/json_writer.c \
-    src/core/profiling/basic_timers.c \
-    src/core/profiling/stap_timers.c \
     src/core/surface/api_trace.c \
     src/core/surface/byte_buffer.c \
     src/core/surface/byte_buffer_queue.c \
@@ -4399,8 +4400,6 @@
     src/core/json/json_reader.c \
     src/core/json/json_string.c \
     src/core/json/json_writer.c \
-    src/core/profiling/basic_timers.c \
-    src/core/profiling/stap_timers.c \
     src/core/surface/api_trace.c \
     src/core/surface/byte_buffer.c \
     src/core/surface/byte_buffer_queue.c \
diff --git a/binding.gyp b/binding.gyp
index 392835c..39036b3 100644
--- a/binding.gyp
+++ b/binding.gyp
@@ -64,6 +64,8 @@
       'dependencies': [
       ],
       'sources': [
+        'src/core/profiling/basic_timers.c',
+        'src/core/profiling/stap_timers.c',
         'src/core/support/alloc.c',
         'src/core/support/cmdline.c',
         'src/core/support/cpu_iphone.c',
@@ -99,6 +101,7 @@
         'src/core/support/thd_win32.c',
         'src/core/support/time.c',
         'src/core/support/time_posix.c',
+        'src/core/support/time_precise.c',
         'src/core/support/time_win32.c',
         'src/core/support/tls_pthread.c',
       ],
@@ -208,8 +211,6 @@
         'src/core/json/json_reader.c',
         'src/core/json/json_string.c',
         'src/core/json/json_writer.c',
-        'src/core/profiling/basic_timers.c',
-        'src/core/profiling/stap_timers.c',
         'src/core/surface/api_trace.c',
         'src/core/surface/byte_buffer.c',
         'src/core/surface/byte_buffer_queue.c',
diff --git a/build.yaml b/build.yaml
index e277b60..9fd667d 100644
--- a/build.yaml
+++ b/build.yaml
@@ -177,7 +177,6 @@
   - src/core/json/json_common.h
   - src/core/json/json_reader.h
   - src/core/json/json_writer.h
-  - src/core/profiling/timers.h
   - src/core/statistics/census_interface.h
   - src/core/statistics/census_rpc_stats.h
   - src/core/surface/api_trace.h
@@ -293,8 +292,6 @@
   - src/core/json/json_reader.c
   - src/core/json/json_string.c
   - src/core/json/json_writer.c
-  - src/core/profiling/basic_timers.c
-  - src/core/profiling/stap_timers.c
   - src/core/surface/api_trace.c
   - src/core/surface/byte_buffer.c
   - src/core/surface/byte_buffer_queue.c
@@ -393,6 +390,7 @@
   - include/grpc/support/tls_pthread.h
   - include/grpc/support/useful.h
   headers:
+  - src/core/profiling/timers.h
   - src/core/support/block_annotate.h
   - src/core/support/env.h
   - src/core/support/file.h
@@ -403,6 +401,8 @@
   - src/core/support/thd_internal.h
   - src/core/support/time_precise.h
   src:
+  - src/core/profiling/basic_timers.c
+  - src/core/profiling/stap_timers.c
   - src/core/support/alloc.c
   - src/core/support/cmdline.c
   - src/core/support/cpu_iphone.c
@@ -438,6 +438,7 @@
   - src/core/support/thd_win32.c
   - src/core/support/time.c
   - src/core/support/time_posix.c
+  - src/core/support/time_precise.c
   - src/core/support/time_win32.c
   - src/core/support/tls_pthread.c
   secure: false
diff --git a/gRPC.podspec b/gRPC.podspec
index 6378cf0..66b49a8 100644
--- a/gRPC.podspec
+++ b/gRPC.podspec
@@ -63,7 +63,8 @@
 
   # Core cross-platform gRPC library, written in C.
   s.subspec 'C-Core' do |ss|
-    ss.source_files = 'src/core/support/block_annotate.h',
+    ss.source_files = 'src/core/profiling/timers.h',
+                      'src/core/support/block_annotate.h',
                       'src/core/support/env.h',
                       'src/core/support/file.h',
                       'src/core/support/murmur_hash.h',
@@ -99,6 +100,8 @@
                       'grpc/support/tls_msvc.h',
                       'grpc/support/tls_pthread.h',
                       'grpc/support/useful.h',
+                      'src/core/profiling/basic_timers.c',
+                      'src/core/profiling/stap_timers.c',
                       'src/core/support/alloc.c',
                       'src/core/support/cmdline.c',
                       'src/core/support/cpu_iphone.c',
@@ -134,6 +137,7 @@
                       'src/core/support/thd_win32.c',
                       'src/core/support/time.c',
                       'src/core/support/time_posix.c',
+                      'src/core/support/time_precise.c',
                       'src/core/support/time_win32.c',
                       'src/core/support/tls_pthread.c',
                       'src/core/security/auth_filters.h',
@@ -221,7 +225,6 @@
                       'src/core/json/json_common.h',
                       'src/core/json/json_reader.h',
                       'src/core/json/json_writer.h',
-                      'src/core/profiling/timers.h',
                       'src/core/statistics/census_interface.h',
                       'src/core/statistics/census_rpc_stats.h',
                       'src/core/surface/api_trace.h',
@@ -367,8 +370,6 @@
                       'src/core/json/json_reader.c',
                       'src/core/json/json_string.c',
                       'src/core/json/json_writer.c',
-                      'src/core/profiling/basic_timers.c',
-                      'src/core/profiling/stap_timers.c',
                       'src/core/surface/api_trace.c',
                       'src/core/surface/byte_buffer.c',
                       'src/core/surface/byte_buffer_queue.c',
@@ -419,7 +420,8 @@
                       'src/core/census/operation.c',
                       'src/core/census/tracing.c'
 
-    ss.private_header_files = 'src/core/support/block_annotate.h',
+    ss.private_header_files = 'src/core/profiling/timers.h',
+                              'src/core/support/block_annotate.h',
                               'src/core/support/env.h',
                               'src/core/support/file.h',
                               'src/core/support/murmur_hash.h',
@@ -513,7 +515,6 @@
                               'src/core/json/json_common.h',
                               'src/core/json/json_reader.h',
                               'src/core/json/json_writer.h',
-                              'src/core/profiling/timers.h',
                               'src/core/statistics/census_interface.h',
                               'src/core/statistics/census_rpc_stats.h',
                               'src/core/surface/api_trace.h',
diff --git a/src/core/channel/client_channel.c b/src/core/channel/client_channel.c
index ce8af96..9f85557 100644
--- a/src/core/channel/client_channel.c
+++ b/src/core/channel/client_channel.c
@@ -36,17 +36,19 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "src/core/channel/channel_args.h"
-#include "src/core/channel/connected_channel.h"
-#include "src/core/surface/channel.h"
-#include "src/core/iomgr/iomgr.h"
-#include "src/core/support/string.h"
-#include "src/core/transport/connectivity_state.h"
 #include <grpc/support/alloc.h>
 #include <grpc/support/log.h>
 #include <grpc/support/sync.h>
 #include <grpc/support/useful.h>
 
+#include "src/core/channel/channel_args.h"
+#include "src/core/channel/connected_channel.h"
+#include "src/core/iomgr/iomgr.h"
+#include "src/core/profiling/timers.h"
+#include "src/core/support/string.h"
+#include "src/core/surface/channel.h"
+#include "src/core/transport/connectivity_state.h"
+
 /* Client channel implementation */
 
 typedef struct call_data call_data;
@@ -242,6 +244,8 @@
   grpc_pollset *pollset;
   grpc_subchannel_call_create_status call_creation_status;
 
+  GPR_TIMER_BEGIN("picked_target", 0);
+
   if (calld->picked_channel == NULL) {
     /* treat this like a cancellation */
     calld->waiting_op.cancel_with_status = GRPC_STATUS_UNAVAILABLE;
@@ -266,6 +270,8 @@
       }
     }
   }
+
+  GPR_TIMER_END("picked_target", 0);
 }
 
 static grpc_closure *merge_into_waiting_op(grpc_call_element *elem,
@@ -326,6 +332,7 @@
   grpc_subchannel_call *subchannel_call;
   grpc_lb_policy *lb_policy;
   grpc_transport_stream_op op2;
+  GPR_TIMER_BEGIN("perform_transport_stream_op", 0);
   GPR_ASSERT(elem->filter == &grpc_client_channel_filter);
   GRPC_CALL_LOG_OP(GPR_INFO, elem, op);
 
@@ -437,6 +444,8 @@
       }
       break;
   }
+
+  GPR_TIMER_END("perform_transport_stream_op", 0);
 }
 
 static void cc_start_transport_stream_op(grpc_exec_ctx *exec_ctx,
diff --git a/src/core/channel/compress_filter.c b/src/core/channel/compress_filter.c
index c32f150..20b5084 100644
--- a/src/core/channel/compress_filter.c
+++ b/src/core/channel/compress_filter.c
@@ -41,6 +41,7 @@
 
 #include "src/core/channel/compress_filter.h"
 #include "src/core/channel/channel_args.h"
+#include "src/core/profiling/timers.h"
 #include "src/core/compression/message_compress.h"
 #include "src/core/support/string.h"
 
@@ -271,10 +272,14 @@
 static void compress_start_transport_stream_op(grpc_exec_ctx *exec_ctx,
                                                grpc_call_element *elem,
                                                grpc_transport_stream_op *op) {
+  GPR_TIMER_BEGIN("compress_start_transport_stream_op", 0);
+
   if (op->send_ops && op->send_ops->nops > 0) {
     process_send_ops(elem, op->send_ops);
   }
 
+  GPR_TIMER_END("compress_start_transport_stream_op", 0);
+
   /* pass control down the stack */
   grpc_call_next_op(exec_ctx, elem, op);
 }
diff --git a/src/core/channel/connected_channel.c b/src/core/channel/connected_channel.c
index f9fc280..6d4d7be 100644
--- a/src/core/channel/connected_channel.c
+++ b/src/core/channel/connected_channel.c
@@ -39,6 +39,7 @@
 
 #include "src/core/support/string.h"
 #include "src/core/transport/transport.h"
+#include "src/core/profiling/timers.h"
 #include <grpc/byte_buffer.h>
 #include <grpc/support/alloc.h>
 #include <grpc/support/log.h>
diff --git a/src/core/channel/http_client_filter.c b/src/core/channel/http_client_filter.c
index d67dc37..f78a5cc 100644
--- a/src/core/channel/http_client_filter.c
+++ b/src/core/channel/http_client_filter.c
@@ -36,6 +36,7 @@
 #include <grpc/support/log.h>
 #include <grpc/support/string_util.h>
 #include "src/core/support/string.h"
+#include "src/core/profiling/timers.h"
 
 typedef struct call_data {
   grpc_linked_mdelem method;
@@ -162,8 +163,10 @@
 static void hc_start_transport_op(grpc_exec_ctx *exec_ctx,
                                   grpc_call_element *elem,
                                   grpc_transport_stream_op *op) {
+  GPR_TIMER_BEGIN("hc_start_transport_op", 0);
   GRPC_CALL_LOG_OP(GPR_INFO, elem, op);
   hc_mutate_op(elem, op);
+  GPR_TIMER_END("hc_start_transport_op", 0);
   grpc_call_next_op(exec_ctx, elem, op);
 }
 
diff --git a/src/core/channel/http_server_filter.c b/src/core/channel/http_server_filter.c
index 5e6d684..99e5066 100644
--- a/src/core/channel/http_server_filter.c
+++ b/src/core/channel/http_server_filter.c
@@ -36,6 +36,7 @@
 #include <string.h>
 #include <grpc/support/alloc.h>
 #include <grpc/support/log.h>
+#include "src/core/profiling/timers.h"
 
 typedef struct call_data {
   gpr_uint8 got_initial_metadata;
@@ -230,8 +231,10 @@
                                   grpc_call_element *elem,
                                   grpc_transport_stream_op *op) {
   GRPC_CALL_LOG_OP(GPR_INFO, elem, op);
+  GPR_TIMER_BEGIN("hs_start_transport_op", 0);
   hs_mutate_op(elem, op);
   grpc_call_next_op(exec_ctx, elem, op);
+  GPR_TIMER_END("hs_start_transport_op", 0);
 }
 
 /* Constructor for call_data */
diff --git a/src/core/client_config/lb_policies/pick_first.c b/src/core/client_config/lb_policies/pick_first.c
index 4b3aaab..e5bf068 100644
--- a/src/core/client_config/lb_policies/pick_first.c
+++ b/src/core/client_config/lb_policies/pick_first.c
@@ -175,7 +175,7 @@
   }
 }
 
-static void destroy_subchannels(grpc_exec_ctx *exec_ctx, void *arg, 
+static void destroy_subchannels(grpc_exec_ctx *exec_ctx, void *arg,
                                 int iomgr_success) {
   pick_first_lb_policy *p = arg;
   size_t i;
@@ -235,7 +235,8 @@
         GRPC_SUBCHANNEL_REF(p->selected, "picked_first");
         /* drop the pick list: we are connected now */
         GRPC_LB_POLICY_REF(&p->base, "destroy_subchannels");
-        grpc_exec_ctx_enqueue(exec_ctx, grpc_closure_create(destroy_subchannels, p), 1);
+        grpc_exec_ctx_enqueue(exec_ctx,
+                              grpc_closure_create(destroy_subchannels, p), 1);
         /* update any calls that were waiting for a pick */
         while ((pp = p->pending_picks)) {
           p->pending_picks = pp->next;
diff --git a/src/core/iomgr/exec_ctx.c b/src/core/iomgr/exec_ctx.c
index f2914d3..410b34c 100644
--- a/src/core/iomgr/exec_ctx.c
+++ b/src/core/iomgr/exec_ctx.c
@@ -35,18 +35,24 @@
 
 #include <grpc/support/log.h>
 
+#include "src/core/profiling/timers.h"
+
 int grpc_exec_ctx_flush(grpc_exec_ctx *exec_ctx) {
   int did_something = 0;
+  GPR_TIMER_BEGIN("grpc_exec_ctx_flush", 0);
   while (!grpc_closure_list_empty(exec_ctx->closure_list)) {
     grpc_closure *c = exec_ctx->closure_list.head;
     exec_ctx->closure_list.head = exec_ctx->closure_list.tail = NULL;
     while (c != NULL) {
       grpc_closure *next = c->next;
-      did_something = 1;
+      did_something++;
+      GPR_TIMER_BEGIN("grpc_exec_ctx_flush.cb", 0);
       c->cb(exec_ctx, c->cb_arg, c->success);
+      GPR_TIMER_END("grpc_exec_ctx_flush.cb", 0);
       c = next;
     }
   }
+  GPR_TIMER_END("grpc_exec_ctx_flush", 0);
   return did_something;
 }
 
diff --git a/src/core/iomgr/pollset_multipoller_with_epoll.c b/src/core/iomgr/pollset_multipoller_with_epoll.c
index ba9ba73..2aafd21 100644
--- a/src/core/iomgr/pollset_multipoller_with_epoll.c
+++ b/src/core/iomgr/pollset_multipoller_with_epoll.c
@@ -41,10 +41,11 @@
 #include <sys/epoll.h>
 #include <unistd.h>
 
-#include "src/core/iomgr/fd_posix.h"
-#include "src/core/support/block_annotate.h"
 #include <grpc/support/alloc.h>
 #include <grpc/support/log.h>
+#include "src/core/iomgr/fd_posix.h"
+#include "src/core/support/block_annotate.h"
+#include "src/core/profiling/timers.h"
 
 typedef struct wakeup_fd_hdl {
   grpc_wakeup_fd wakeup_fd;
@@ -182,9 +183,11 @@
 
   /* TODO(vpai): Consider first doing a 0 timeout poll here to avoid
      even going into the blocking annotation if possible */
+  GPR_TIMER_BEGIN("poll", 0);
   GRPC_SCHEDULING_START_BLOCKING_REGION;
   poll_rv = grpc_poll_function(pfds, 2, timeout_ms);
   GRPC_SCHEDULING_END_BLOCKING_REGION;
+  GPR_TIMER_END("poll", 0);
 
   if (poll_rv < 0) {
     if (errno != EINTR) {
diff --git a/src/core/iomgr/pollset_posix.c b/src/core/iomgr/pollset_posix.c
index d056866..4d8bc53 100644
--- a/src/core/iomgr/pollset_posix.c
+++ b/src/core/iomgr/pollset_posix.c
@@ -101,9 +101,12 @@
 void grpc_pollset_kick_ext(grpc_pollset *p,
                            grpc_pollset_worker *specific_worker,
                            gpr_uint32 flags) {
+  GPR_TIMER_BEGIN("grpc_pollset_kick_ext", 0);
+
   /* pollset->mu already held */
   if (specific_worker != NULL) {
     if (specific_worker == GRPC_POLLSET_KICK_BROADCAST) {
+      GPR_TIMER_BEGIN("grpc_pollset_kick_ext.broadcast", 0);
       GPR_ASSERT((flags & GRPC_POLLSET_REEVALUATE_POLLING_ON_WAKEUP) == 0);
       for (specific_worker = p->root_worker.next;
            specific_worker != &p->root_worker;
@@ -111,44 +114,50 @@
         grpc_wakeup_fd_wakeup(&specific_worker->wakeup_fd);
       }
       p->kicked_without_pollers = 1;
-      return;
+      GPR_TIMER_END("grpc_pollset_kick_ext.broadcast", 0);
     } else if (gpr_tls_get(&g_current_thread_worker) !=
                (gpr_intptr)specific_worker) {
+      GPR_TIMER_MARK("different_thread_worker", 0);
       if ((flags & GRPC_POLLSET_REEVALUATE_POLLING_ON_WAKEUP) != 0) {
         specific_worker->reevaluate_polling_on_wakeup = 1;
       }
       grpc_wakeup_fd_wakeup(&specific_worker->wakeup_fd);
-      return;
     } else if ((flags & GRPC_POLLSET_CAN_KICK_SELF) != 0) {
+      GPR_TIMER_MARK("kick_yoself", 0);
       if ((flags & GRPC_POLLSET_REEVALUATE_POLLING_ON_WAKEUP) != 0) {
         specific_worker->reevaluate_polling_on_wakeup = 1;
       }
       grpc_wakeup_fd_wakeup(&specific_worker->wakeup_fd);
-      return;
     }
   } else if (gpr_tls_get(&g_current_thread_poller) != (gpr_intptr)p) {
     GPR_ASSERT((flags & GRPC_POLLSET_REEVALUATE_POLLING_ON_WAKEUP) == 0);
+    GPR_TIMER_MARK("kick_anonymous", 0);
     specific_worker = pop_front_worker(p);
     if (specific_worker != NULL) {
       if (gpr_tls_get(&g_current_thread_worker) ==
           (gpr_intptr)specific_worker) {
+        GPR_TIMER_MARK("kick_anonymous_not_self", 0);
         push_back_worker(p, specific_worker);
         specific_worker = pop_front_worker(p);
         if ((flags & GRPC_POLLSET_CAN_KICK_SELF) == 0 &&
             gpr_tls_get(&g_current_thread_worker) ==
                 (gpr_intptr)specific_worker) {
           push_back_worker(p, specific_worker);
-          return;
+          specific_worker = NULL;
         }
       }
-      push_back_worker(p, specific_worker);
-      grpc_wakeup_fd_wakeup(&specific_worker->wakeup_fd);
-      return;
+      if (specific_worker != NULL) {
+        GPR_TIMER_MARK("finally_kick", 0);
+        push_back_worker(p, specific_worker);
+        grpc_wakeup_fd_wakeup(&specific_worker->wakeup_fd);
+      }
     } else {
+      GPR_TIMER_MARK("kicked_no_pollers", 0);
       p->kicked_without_pollers = 1;
-      return;
     }
   }
+
+  GPR_TIMER_END("grpc_pollset_kick_ext", 0);
 }
 
 void grpc_pollset_kick(grpc_pollset *p, grpc_pollset_worker *specific_worker) {
@@ -229,6 +238,7 @@
   int locked = 1;
   int queued_work = 0;
   int keep_polling = 0;
+  GPR_TIMER_BEGIN("grpc_pollset_work", 0);
   /* this must happen before we (potentially) drop pollset->mu */
   worker->next = worker->prev = NULL;
   worker->reevaluate_polling_on_wakeup = 0;
@@ -273,8 +283,10 @@
       }
       gpr_tls_set(&g_current_thread_poller, (gpr_intptr)pollset);
       gpr_tls_set(&g_current_thread_worker, (gpr_intptr)worker);
+      GPR_TIMER_BEGIN("maybe_work_and_unlock", 0);
       pollset->vtable->maybe_work_and_unlock(exec_ctx, pollset, worker,
                                              deadline, now);
+      GPR_TIMER_END("maybe_work_and_unlock", 0);
       locked = 0;
       gpr_tls_set(&g_current_thread_poller, 0);
       gpr_tls_set(&g_current_thread_worker, 0);
@@ -329,6 +341,7 @@
       gpr_mu_lock(&pollset->mu);
     }
   }
+  GPR_TIMER_END("grpc_pollset_work", 0);
 }
 
 void grpc_pollset_shutdown(grpc_exec_ctx *exec_ctx, grpc_pollset *pollset,
@@ -563,10 +576,11 @@
      even going into the blocking annotation if possible */
   /* poll fd count (argument 2) is shortened by one if we have no events
      to poll on - such that it only includes the kicker */
+  GPR_TIMER_BEGIN("poll", 0);
   GRPC_SCHEDULING_START_BLOCKING_REGION;
   r = grpc_poll_function(pfd, nfds, timeout);
   GRPC_SCHEDULING_END_BLOCKING_REGION;
-  GRPC_TIMER_MARK(GRPC_PTAG_POLL_FINISHED, r);
+  GPR_TIMER_END("poll", 0);
 
   if (r < 0) {
     gpr_log(GPR_ERROR, "poll() failed: %s", strerror(errno));
diff --git a/src/core/iomgr/tcp_posix.c b/src/core/iomgr/tcp_posix.c
index 4a57037..915553d 100644
--- a/src/core/iomgr/tcp_posix.c
+++ b/src/core/iomgr/tcp_posix.c
@@ -180,7 +180,7 @@
   GPR_ASSERT(!tcp->finished_edge);
   GPR_ASSERT(tcp->iov_size <= MAX_READ_IOVEC);
   GPR_ASSERT(tcp->incoming_buffer->count <= MAX_READ_IOVEC);
-  GRPC_TIMER_BEGIN(GRPC_PTAG_HANDLE_READ, 0);
+  GPR_TIMER_BEGIN("tcp_continue_read", 0);
 
   while (tcp->incoming_buffer->count < (size_t)tcp->iov_size) {
     gpr_slice_buffer_add_indexed(tcp->incoming_buffer,
@@ -199,11 +199,11 @@
   msg.msg_controllen = 0;
   msg.msg_flags = 0;
 
-  GRPC_TIMER_BEGIN(GRPC_PTAG_RECVMSG, 0);
+  GPR_TIMER_BEGIN("recvmsg", 1);
   do {
     read_bytes = recvmsg(tcp->fd, &msg, 0);
   } while (read_bytes < 0 && errno == EINTR);
-  GRPC_TIMER_END(GRPC_PTAG_RECVMSG, 0);
+  GPR_TIMER_END("recvmsg", 0);
 
   if (read_bytes < 0) {
     /* NB: After calling call_read_cb a parallel call of the read handler may
@@ -240,7 +240,7 @@
     TCP_UNREF(exec_ctx, tcp, "read");
   }
 
-  GRPC_TIMER_END(GRPC_PTAG_HANDLE_READ, 0);
+  GPR_TIMER_END("tcp_continue_read", 0);
 }
 
 static void tcp_handle_read(grpc_exec_ctx *exec_ctx, void *arg /* grpc_tcp */,
@@ -316,12 +316,12 @@
     msg.msg_controllen = 0;
     msg.msg_flags = 0;
 
-    GRPC_TIMER_BEGIN(GRPC_PTAG_SENDMSG, 0);
+    GPR_TIMER_BEGIN("sendmsg", 1);
     do {
       /* TODO(klempner): Cork if this is a partial write */
       sent_length = sendmsg(tcp->fd, &msg, SENDMSG_FLAGS);
     } while (sent_length < 0 && errno == EINTR);
-    GRPC_TIMER_END(GRPC_PTAG_SENDMSG, 0);
+    GPR_TIMER_END("sendmsg", 0);
 
     if (sent_length < 0) {
       if (errno == EAGAIN) {
@@ -370,17 +370,17 @@
     return;
   }
 
-  GRPC_TIMER_BEGIN(GRPC_PTAG_TCP_CB_WRITE, 0);
   status = tcp_flush(tcp);
   if (status == FLUSH_PENDING) {
     grpc_fd_notify_on_write(exec_ctx, tcp->em_fd, &tcp->write_closure);
   } else {
     cb = tcp->write_cb;
     tcp->write_cb = NULL;
+    GPR_TIMER_BEGIN("tcp_handle_write.cb", 0);
     cb->cb(exec_ctx, cb->cb_arg, status == FLUSH_DONE);
+    GPR_TIMER_END("tcp_handle_write.cb", 0);
     TCP_UNREF(exec_ctx, tcp, "write");
   }
-  GRPC_TIMER_END(GRPC_PTAG_TCP_CB_WRITE, 0);
 }
 
 static void tcp_write(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
@@ -399,11 +399,11 @@
     }
   }
 
-  GRPC_TIMER_BEGIN(GRPC_PTAG_TCP_WRITE, 0);
+  GPR_TIMER_BEGIN("tcp_write", 0);
   GPR_ASSERT(tcp->write_cb == NULL);
 
   if (buf->length == 0) {
-    GRPC_TIMER_END(GRPC_PTAG_TCP_WRITE, 0);
+    GPR_TIMER_END("tcp_write", 0);
     grpc_exec_ctx_enqueue(exec_ctx, cb, 1);
     return;
   }
@@ -420,7 +420,7 @@
     grpc_exec_ctx_enqueue(exec_ctx, cb, status == FLUSH_DONE);
   }
 
-  GRPC_TIMER_END(GRPC_PTAG_TCP_WRITE, 0);
+  GPR_TIMER_END("tcp_write", 0);
 }
 
 static void tcp_add_to_pollset(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
diff --git a/src/core/iomgr/wakeup_fd_eventfd.c b/src/core/iomgr/wakeup_fd_eventfd.c
index 48eb1af..f67379e 100644
--- a/src/core/iomgr/wakeup_fd_eventfd.c
+++ b/src/core/iomgr/wakeup_fd_eventfd.c
@@ -39,9 +39,11 @@
 #include <sys/eventfd.h>
 #include <unistd.h>
 
-#include "src/core/iomgr/wakeup_fd_posix.h"
 #include <grpc/support/log.h>
 
+#include "src/core/iomgr/wakeup_fd_posix.h"
+#include "src/core/profiling/timers.h"
+
 static void eventfd_create(grpc_wakeup_fd* fd_info) {
   int efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
   /* TODO(klempner): Handle failure more gracefully */
@@ -60,9 +62,11 @@
 
 static void eventfd_wakeup(grpc_wakeup_fd* fd_info) {
   int err;
+  GPR_TIMER_BEGIN("eventfd_wakeup", 0);
   do {
     err = eventfd_write(fd_info->read_fd, 1);
   } while (err < 0 && errno == EINTR);
+  GPR_TIMER_END("eventfd_wakeup", 0);
 }
 
 static void eventfd_destroy(grpc_wakeup_fd* fd_info) {
diff --git a/src/core/profiling/basic_timers.c b/src/core/profiling/basic_timers.c
index 2f6c88d..b49cdd0 100644
--- a/src/core/profiling/basic_timers.c
+++ b/src/core/profiling/basic_timers.c
@@ -44,98 +44,91 @@
 #include <grpc/support/thd.h>
 #include <stdio.h>
 
-typedef enum {
-  BEGIN = '{',
-  END = '}',
-  MARK = '.',
-  IMPORTANT = '!'
-} marker_type;
+typedef enum { BEGIN = '{', END = '}', MARK = '.' } marker_type;
 
-typedef struct grpc_timer_entry {
+typedef struct gpr_timer_entry {
   gpr_timespec tm;
-  int tag;
   const char *tagstr;
-  marker_type type;
-  void *id;
   const char *file;
   int line;
-} grpc_timer_entry;
+  char type;
+  gpr_uint8 important;
+} gpr_timer_entry;
 
-#define MAX_COUNT (1024 * 1024 / sizeof(grpc_timer_entry))
+#define MAX_COUNT (1024 * 1024 / sizeof(gpr_timer_entry))
 
-static __thread grpc_timer_entry log[MAX_COUNT];
-static __thread int count;
+static __thread gpr_timer_entry g_log[MAX_COUNT];
+static __thread int g_count;
+static gpr_once g_once_init = GPR_ONCE_INIT;
+static FILE *output_file;
+
+static void close_output() { fclose(output_file); }
+
+static void init_output() {
+  output_file = fopen("latency_trace.txt", "w");
+  GPR_ASSERT(output_file);
+  atexit(close_output);
+}
 
 static void log_report() {
   int i;
-  for (i = 0; i < count; i++) {
-    grpc_timer_entry *entry = &(log[i]);
-    printf("GRPC_LAT_PROF %ld.%09d  %p %c %d(%s) %p %s %d\n", entry->tm.tv_sec,
-           entry->tm.tv_nsec, (void *)(gpr_intptr)gpr_thd_currentid(),
-           entry->type, entry->tag, entry->tagstr, entry->id, entry->file,
-           entry->line);
+  gpr_once_init(&g_once_init, init_output);
+  for (i = 0; i < g_count; i++) {
+    gpr_timer_entry *entry = &(g_log[i]);
+    fprintf(output_file,
+            "{\"t\": %ld.%09d, \"thd\": \"%p\", \"type\": \"%c\", \"tag\": "
+            "\"%s\", \"file\": \"%s\", \"line\": %d, \"imp\": %d}\n",
+            entry->tm.tv_sec, entry->tm.tv_nsec,
+            (void *)(gpr_intptr)gpr_thd_currentid(), entry->type, entry->tagstr,
+            entry->file, entry->line, entry->important);
   }
 
   /* Now clear out the log */
-  count = 0;
+  g_count = 0;
 }
 
-static void grpc_timers_log_add(int tag, const char *tagstr, marker_type type,
-                                void *id, const char *file, int line) {
-  grpc_timer_entry *entry;
+static void gpr_timers_log_add(const char *tagstr, marker_type type,
+                               int important, const char *file, int line) {
+  gpr_timer_entry *entry;
 
   /* TODO (vpai) : Improve concurrency */
-  if (count == MAX_COUNT) {
+  if (g_count == MAX_COUNT) {
     log_report();
   }
 
-  entry = &log[count++];
+  entry = &g_log[g_count++];
 
   entry->tm = gpr_now(GPR_CLOCK_PRECISE);
-  entry->tag = tag;
   entry->tagstr = tagstr;
   entry->type = type;
-  entry->id = id;
   entry->file = file;
   entry->line = line;
+  entry->important = important != 0;
 }
 
 /* Latency profiler API implementation. */
-void grpc_timer_add_mark(int tag, const char *tagstr, void *id,
-                         const char *file, int line) {
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {
-    grpc_timers_log_add(tag, tagstr, MARK, id, file, line);
-  }
+void gpr_timer_add_mark(const char *tagstr, int important, const char *file,
+                        int line) {
+  gpr_timers_log_add(tagstr, MARK, important, file, line);
 }
 
-void grpc_timer_add_important_mark(int tag, const char *tagstr, void *id,
-                                   const char *file, int line) {
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {
-    grpc_timers_log_add(tag, tagstr, IMPORTANT, id, file, line);
-  }
+void gpr_timer_begin(const char *tagstr, int important, const char *file,
+                     int line) {
+  gpr_timers_log_add(tagstr, BEGIN, important, file, line);
 }
 
-void grpc_timer_begin(int tag, const char *tagstr, void *id, const char *file,
-                      int line) {
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {
-    grpc_timers_log_add(tag, tagstr, BEGIN, id, file, line);
-  }
-}
-
-void grpc_timer_end(int tag, const char *tagstr, void *id, const char *file,
-                    int line) {
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {
-    grpc_timers_log_add(tag, tagstr, END, id, file, line);
-  }
+void gpr_timer_end(const char *tagstr, int important, const char *file,
+                   int line) {
+  gpr_timers_log_add(tagstr, END, important, file, line);
 }
 
 /* Basic profiler specific API functions. */
-void grpc_timers_global_init(void) {}
+void gpr_timers_global_init(void) {}
 
-void grpc_timers_global_destroy(void) {}
+void gpr_timers_global_destroy(void) {}
 
 #else  /* !GRPC_BASIC_PROFILER */
-void grpc_timers_global_init(void) {}
+void gpr_timers_global_init(void) {}
 
-void grpc_timers_global_destroy(void) {}
+void gpr_timers_global_destroy(void) {}
 #endif /* GRPC_BASIC_PROFILER */
diff --git a/src/core/profiling/stap_timers.c b/src/core/profiling/stap_timers.c
index 6868a67..efcd1af 100644
--- a/src/core/profiling/stap_timers.c
+++ b/src/core/profiling/stap_timers.c
@@ -42,23 +42,23 @@
 #include "src/core/profiling/stap_probes.h"
 
 /* Latency profiler API implementation. */
-void grpc_timer_add_mark(int tag, const char *tagstr, void *id,
-                         const char *file, int line) {
+void gpr_timer_add_mark(int tag, const char *tagstr, void *id, const char *file,
+                        int line) {
   _STAP_ADD_MARK(tag);
 }
 
-void grpc_timer_add_important_mark(int tag, const char *tagstr, void *id,
-                                   const char *file, int line) {
+void gpr_timer_add_important_mark(int tag, const char *tagstr, void *id,
+                                  const char *file, int line) {
   _STAP_ADD_IMPORTANT_MARK(tag);
 }
 
-void grpc_timer_begin(int tag, const char *tagstr, void *id, const char *file,
-                      int line) {
+void gpr_timer_begin(int tag, const char *tagstr, void *id, const char *file,
+                     int line) {
   _STAP_TIMING_NS_BEGIN(tag);
 }
 
-void grpc_timer_end(int tag, const char *tagstr, void *id, const char *file,
-                    int line) {
+void gpr_timer_end(int tag, const char *tagstr, void *id, const char *file,
+                   int line) {
   _STAP_TIMING_NS_END(tag);
 }
 
diff --git a/src/core/profiling/timers.h b/src/core/profiling/timers.h
index a705204..0d112e7 100644
--- a/src/core/profiling/timers.h
+++ b/src/core/profiling/timers.h
@@ -38,65 +38,28 @@
 extern "C" {
 #endif
 
-void grpc_timers_global_init(void);
-void grpc_timers_global_destroy(void);
+void gpr_timers_global_init(void);
+void gpr_timers_global_destroy(void);
 
-void grpc_timer_add_mark(int tag, const char *tagstr, void *id,
-                         const char *file, int line);
-void grpc_timer_add_important_mark(int tag, const char *tagstr, void *id,
-                                   const char *file, int line);
-void grpc_timer_begin(int tag, const char *tagstr, void *id, const char *file,
-                      int line);
-void grpc_timer_end(int tag, const char *tagstr, void *id, const char *file,
-                    int line);
-
-enum grpc_profiling_tags {
-  /* Any GRPC_PTAG_* >= than the threshold won't generate any profiling mark. */
-  GRPC_PTAG_IGNORE_THRESHOLD = 1000000,
-
-  /* Re. Protos. */
-  GRPC_PTAG_PROTO_SERIALIZE = 100 + GRPC_PTAG_IGNORE_THRESHOLD,
-  GRPC_PTAG_PROTO_DESERIALIZE = 101 + GRPC_PTAG_IGNORE_THRESHOLD,
-
-  /* Re. sockets. */
-  GRPC_PTAG_HANDLE_READ = 200 + GRPC_PTAG_IGNORE_THRESHOLD,
-  GRPC_PTAG_SENDMSG = 201 + GRPC_PTAG_IGNORE_THRESHOLD,
-  GRPC_PTAG_RECVMSG = 202 + GRPC_PTAG_IGNORE_THRESHOLD,
-  GRPC_PTAG_POLL_FINISHED = 203 + GRPC_PTAG_IGNORE_THRESHOLD,
-  GRPC_PTAG_TCP_CB_WRITE = 204 + GRPC_PTAG_IGNORE_THRESHOLD,
-  GRPC_PTAG_TCP_WRITE = 205 + GRPC_PTAG_IGNORE_THRESHOLD,
-  GRPC_PTAG_CALL_ON_DONE_RECV = 206 + GRPC_PTAG_IGNORE_THRESHOLD,
-
-  /* C++ */
-  GRPC_PTAG_CPP_CALL_CREATED = 300 + GRPC_PTAG_IGNORE_THRESHOLD,
-  GRPC_PTAG_CPP_PERFORM_OPS = 301 + GRPC_PTAG_IGNORE_THRESHOLD,
-
-  /* Transports */
-  GRPC_PTAG_HTTP2_UNLOCK = 401 + GRPC_PTAG_IGNORE_THRESHOLD,
-  GRPC_PTAG_HTTP2_UNLOCK_CLEANUP = 402 + GRPC_PTAG_IGNORE_THRESHOLD,
-
-  /* > 1024 Unassigned reserved. For any miscellaneous use.
-   * Use addition to generate tags from this base or take advantage of the 10
-   * zero'd bits for OR-ing. */
-  GRPC_PTAG_OTHER_BASE = 1024
-};
+void gpr_timer_add_mark(const char *tagstr, int important, const char *file,
+                        int line);
+void gpr_timer_begin(const char *tagstr, int important, const char *file,
+                     int line);
+void gpr_timer_end(const char *tagstr, int important, const char *file,
+                   int line);
 
 #if !(defined(GRPC_STAP_PROFILER) + defined(GRPC_BASIC_PROFILER))
 /* No profiling. No-op all the things. */
-#define GRPC_TIMER_MARK(tag, id) \
-  do {                           \
+#define GPR_TIMER_MARK(tag, important) \
+  do {                                 \
   } while (0)
 
-#define GRPC_TIMER_IMPORTANT_MARK(tag, id) \
-  do {                                     \
+#define GPR_TIMER_BEGIN(tag, important) \
+  do {                                  \
   } while (0)
 
-#define GRPC_TIMER_BEGIN(tag, id) \
-  do {                            \
-  } while (0)
-
-#define GRPC_TIMER_END(tag, id) \
-  do {                          \
+#define GPR_TIMER_END(tag, important) \
+  do {                                \
   } while (0)
 
 #else /* at least one profiler requested... */
@@ -106,28 +69,14 @@
 #endif
 
 /* Generic profiling interface. */
-#define GRPC_TIMER_MARK(tag, id)                                         \
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {                                \
-    grpc_timer_add_mark(tag, #tag, ((void *)(gpr_intptr)(id)), __FILE__, \
-                        __LINE__);                                       \
-  }
+#define GPR_TIMER_MARK(tag, important) \
+  gpr_timer_add_mark(tag, important, __FILE__, __LINE__);
 
-#define GRPC_TIMER_IMPORTANT_MARK(tag, id)                               \
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {                                \
-    grpc_timer_add_important_mark(tag, #tag, ((void *)(gpr_intptr)(id)), \
-                                  __FILE__, __LINE__);                   \
-  }
+#define GPR_TIMER_BEGIN(tag, important) \
+  gpr_timer_begin(tag, important, __FILE__, __LINE__);
 
-#define GRPC_TIMER_BEGIN(tag, id)                                     \
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {                             \
-    grpc_timer_begin(tag, #tag, ((void *)(gpr_intptr)(id)), __FILE__, \
-                     __LINE__);                                       \
-  }
-
-#define GRPC_TIMER_END(tag, id)                                                \
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {                                      \
-    grpc_timer_end(tag, #tag, ((void *)(gpr_intptr)(id)), __FILE__, __LINE__); \
-  }
+#define GPR_TIMER_END(tag, important) \
+  gpr_timer_end(tag, important, __FILE__, __LINE__);
 
 #ifdef GRPC_STAP_PROFILER
 /* Empty placeholder for now. */
@@ -141,6 +90,28 @@
 
 #ifdef __cplusplus
 }
+
+#if (defined(GRPC_STAP_PROFILER) + defined(GRPC_BASIC_PROFILER))
+namespace grpc {
+class ProfileScope {
+ public:
+  ProfileScope(const char *desc, bool important) : desc_(desc) {
+    GPR_TIMER_BEGIN(desc_, important ? 1 : 0);
+  }
+  ~ProfileScope() { GPR_TIMER_END(desc_, 0); }
+
+ private:
+  const char *const desc_;
+};
+}
+
+#define GPR_TIMER_SCOPE(tag, important) \
+  ::grpc::ProfileScope _profile_scope_##__LINE__((tag), (important))
+#else
+#define GPR_TIMER_SCOPE(tag, important) \
+  do {                                  \
+  } while (false)
+#endif
 #endif
 
 #endif /* GRPC_CORE_PROFILING_TIMERS_H */
diff --git a/src/core/support/alloc.c b/src/core/support/alloc.c
index d2ed82e..bfcb779 100644
--- a/src/core/support/alloc.c
+++ b/src/core/support/alloc.c
@@ -35,22 +35,32 @@
 
 #include <stdlib.h>
 #include <grpc/support/port_platform.h>
+#include "src/core/profiling/timers.h"
 
 void *gpr_malloc(size_t size) {
-  void *p = malloc(size);
+  void *p;
+  GPR_TIMER_BEGIN("gpr_malloc", 0);
+  p = malloc(size);
   if (!p) {
     abort();
   }
+  GPR_TIMER_END("gpr_malloc", 0);
   return p;
 }
 
-void gpr_free(void *p) { free(p); }
+void gpr_free(void *p) {
+  GPR_TIMER_BEGIN("gpr_free", 0);
+  free(p);
+  GPR_TIMER_END("gpr_free", 0);
+}
 
 void *gpr_realloc(void *p, size_t size) {
+  GPR_TIMER_BEGIN("gpr_realloc", 0);
   p = realloc(p, size);
   if (!p) {
     abort();
   }
+  GPR_TIMER_END("gpr_realloc", 0);
   return p;
 }
 
diff --git a/src/core/support/sync_posix.c b/src/core/support/sync_posix.c
index 91c3098..39c96fe 100644
--- a/src/core/support/sync_posix.c
+++ b/src/core/support/sync_posix.c
@@ -40,14 +40,23 @@
 #include <grpc/support/log.h>
 #include <grpc/support/sync.h>
 #include <grpc/support/time.h>
+#include "src/core/profiling/timers.h"
 
 void gpr_mu_init(gpr_mu* mu) { GPR_ASSERT(pthread_mutex_init(mu, NULL) == 0); }
 
 void gpr_mu_destroy(gpr_mu* mu) { GPR_ASSERT(pthread_mutex_destroy(mu) == 0); }
 
-void gpr_mu_lock(gpr_mu* mu) { GPR_ASSERT(pthread_mutex_lock(mu) == 0); }
+void gpr_mu_lock(gpr_mu* mu) {
+  GPR_TIMER_BEGIN("gpr_mu_lock", 0);
+  GPR_ASSERT(pthread_mutex_lock(mu) == 0);
+  GPR_TIMER_END("gpr_mu_lock", 0);
+}
 
-void gpr_mu_unlock(gpr_mu* mu) { GPR_ASSERT(pthread_mutex_unlock(mu) == 0); }
+void gpr_mu_unlock(gpr_mu* mu) {
+  GPR_TIMER_BEGIN("gpr_mu_unlock", 0);
+  GPR_ASSERT(pthread_mutex_unlock(mu) == 0);
+  GPR_TIMER_END("gpr_mu_unlock", 0);
+}
 
 int gpr_mu_trylock(gpr_mu* mu) {
   int err = pthread_mutex_trylock(mu);
diff --git a/src/core/support/time_posix.c b/src/core/support/time_posix.c
index 78f2c2b..02cfca8 100644
--- a/src/core/support/time_posix.c
+++ b/src/core/support/time_posix.c
@@ -63,7 +63,7 @@
 /** maps gpr_clock_type --> clockid_t for clock_gettime */
 static clockid_t clockid_for_gpr_clock[] = {CLOCK_MONOTONIC, CLOCK_REALTIME};
 
-void gpr_time_init(void) {}
+void gpr_time_init(void) { gpr_precise_clock_init(); }
 
 gpr_timespec gpr_now(gpr_clock_type clock_type) {
   struct timespec now;
@@ -89,6 +89,7 @@
 
 void gpr_time_init(void) {
   mach_timebase_info_data_t tb = {0, 1};
+  gpr_precise_clock_init();
   mach_timebase_info(&tb);
   g_time_scale = tb.numer;
   g_time_scale /= tb.denom;
diff --git a/src/core/support/time_precise.c b/src/core/support/time_precise.c
new file mode 100644
index 0000000..b37517e
--- /dev/null
+++ b/src/core/support/time_precise.c
@@ -0,0 +1,89 @@
+/*
+ *
+ * Copyright 2015, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <grpc/support/log.h>
+#include <grpc/support/time.h>
+#include <stdio.h>
+
+#ifdef GRPC_TIMERS_RDTSC
+#if defined(__i386__)
+static void gpr_get_cycle_counter(long long int *clk) {
+  long long int ret;
+  __asm__ volatile("rdtsc" : "=A"(ret));
+  *clk = ret;
+}
+
+// ----------------------------------------------------------------
+#elif defined(__x86_64__) || defined(__amd64__)
+static void gpr_get_cycle_counter(long long int *clk) {
+  unsigned long long low, high;
+  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
+  *clk = (long long)(high << 32) | (long long)low;
+}
+#endif
+
+static double cycles_per_second = 0;
+static long long int start_cycle;
+void gpr_precise_clock_init(void) {
+  time_t start;
+  long long end_cycle;
+  gpr_log(GPR_DEBUG, "Calibrating timers");
+  start = time(NULL);
+  while (time(NULL) == start)
+    ;
+  gpr_get_cycle_counter(&start_cycle);
+  while (time(NULL) <= start + 10)
+    ;
+  gpr_get_cycle_counter(&end_cycle);
+  cycles_per_second = (double)(end_cycle - start_cycle) / 10.0;
+  gpr_log(GPR_DEBUG, "... cycles_per_second = %f\n", cycles_per_second);
+}
+
+void gpr_precise_clock_now(gpr_timespec *clk) {
+  long long int counter;
+  double secs;
+  gpr_get_cycle_counter(&counter);
+  secs = (double)(counter - start_cycle) / cycles_per_second;
+  clk->clock_type = GPR_CLOCK_PRECISE;
+  clk->tv_sec = (time_t)secs;
+  clk->tv_nsec = (int)(1e9 * (secs - (double)clk->tv_sec));
+}
+
+#else  /* GRPC_TIMERS_RDTSC */
+void gpr_precise_clock_init(void) {}
+
+void gpr_precise_clock_now(gpr_timespec *clk) {
+  *clk = gpr_now(GPR_CLOCK_REALTIME);
+  clk->clock_type = GPR_CLOCK_PRECISE;
+}
+#endif /* GRPC_TIMERS_RDTSC */
diff --git a/src/core/support/time_precise.h b/src/core/support/time_precise.h
index cd201fa..80c5000 100644
--- a/src/core/support/time_precise.h
+++ b/src/core/support/time_precise.h
@@ -34,60 +34,9 @@
 #ifndef GRPC_CORE_SUPPORT_TIME_PRECISE_H_
 #define GRPC_CORE_SUPPORT_TIME_PRECISE_H_
 
-#include <grpc/support/sync.h>
 #include <grpc/support/time.h>
-#include <stdio.h>
 
-#ifdef GRPC_TIMERS_RDTSC
-#if defined(__i386__)
-static void gpr_get_cycle_counter(long long int *clk) {
-  long long int ret;
-  __asm__ volatile("rdtsc" : "=A"(ret));
-  *clk = ret;
-}
-
-// ----------------------------------------------------------------
-#elif defined(__x86_64__) || defined(__amd64__)
-static void gpr_get_cycle_counter(long long int *clk) {
-  unsigned long long low, high;
-  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
-  *clk = (high << 32) | low;
-}
-#endif
-
-static gpr_once precise_clock_init = GPR_ONCE_INIT;
-static long long cycles_per_second = 0;
-static void gpr_precise_clock_init() {
-  time_t start = time(NULL);
-  gpr_precise_clock start_cycle;
-  gpr_precise_clock end_cycle;
-  while (time(NULL) == start)
-    ;
-  gpr_get_cycle_counter(&start_cycle);
-  while (time(NULL) == start + 1)
-    ;
-  gpr_get_cycle_counter(&end_cycle);
-  cycles_per_second = end_cycle - start_cycle;
-}
-
-static double grpc_precise_clock_scaling_factor() {
-  gpr_once_init(&precise_clock_init, grpc_precise_clock_init);
-  return 1e6 / cycles_per_second;
-}
-
-static void gpr_precise_clock_now(gpr_timespec *clk) {
-  long long int counter;
-  gpr_get_cycle_counter(&counter);
-  clk->clock = GPR_CLOCK_REALTIME;
-  clk->tv_sec = counter / cycles_per_second;
-  clk->tv_nsec = counter % cycles_per_second;
-}
-
-#else  /* GRPC_TIMERS_RDTSC */
-static void gpr_precise_clock_now(gpr_timespec *clk) {
-  *clk = gpr_now(GPR_CLOCK_REALTIME);
-  clk->clock_type = GPR_CLOCK_PRECISE;
-}
-#endif /* GRPC_TIMERS_RDTSC */
+void gpr_precise_clock_init(void);
+void gpr_precise_clock_now(gpr_timespec *clk);
 
 #endif /* GRPC_CORE_SUPPORT_TIME_PRECISE_ */
diff --git a/src/core/surface/call.c b/src/core/surface/call.c
index 386a4a6..b40e74d 100644
--- a/src/core/surface/call.c
+++ b/src/core/surface/call.c
@@ -306,8 +306,9 @@
   grpc_transport_stream_op *initial_op_ptr = NULL;
   grpc_channel_stack *channel_stack = grpc_channel_get_channel_stack(channel);
   grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
-  grpc_call *call =
-      gpr_malloc(sizeof(grpc_call) + channel_stack->call_stack_size);
+  grpc_call *call;
+  GPR_TIMER_BEGIN("grpc_call_create", 0);
+  call = gpr_malloc(sizeof(grpc_call) + channel_stack->call_stack_size);
   memset(call, 0, sizeof(grpc_call));
   gpr_mu_init(&call->mu);
   gpr_mu_init(&call->completion_mu);
@@ -401,6 +402,7 @@
     set_deadline_alarm(&exec_ctx, call, send_deadline);
   }
   grpc_exec_ctx_finish(&exec_ctx);
+  GPR_TIMER_END("grpc_call_create", 0);
   return call;
 }
 
@@ -461,6 +463,7 @@
 static void destroy_call(grpc_exec_ctx *exec_ctx, grpc_call *call) {
   size_t i;
   grpc_call *c = call;
+  GPR_TIMER_BEGIN("destroy_call", 0);
   grpc_call_stack_destroy(exec_ctx, CALL_STACK_FROM_CALL(c));
   GRPC_CHANNEL_INTERNAL_UNREF(exec_ctx, c->channel, "call");
   gpr_mu_destroy(&c->mu);
@@ -493,6 +496,7 @@
     GRPC_CQ_INTERNAL_UNREF(c->cq, "bind");
   }
   gpr_free(c);
+  GPR_TIMER_END("destroy_call", 0);
 }
 
 #ifdef GRPC_CALL_REF_COUNT_DEBUG
@@ -624,6 +628,8 @@
   const size_t MAX_RECV_PEEK_AHEAD = 65536;
   size_t buffered_bytes;
 
+  GPR_TIMER_BEGIN("unlock", 0);
+
   memset(&op, 0, sizeof(op));
 
   op.cancel_with_status = call->cancel_with_status;
@@ -694,6 +700,8 @@
     unlock(exec_ctx, call);
     GRPC_CALL_INTERNAL_UNREF(exec_ctx, call, "completing");
   }
+
+  GPR_TIMER_END("unlock", 0);
 }
 
 static void get_final_status(grpc_call *call, grpc_ioreq_data out) {
@@ -843,6 +851,7 @@
 
 static void call_on_done_send(grpc_exec_ctx *exec_ctx, void *pc, int success) {
   grpc_call *call = pc;
+  GPR_TIMER_BEGIN("call_on_done_send", 0);
   lock(call);
   if (call->last_send_contains & (1 << GRPC_IOREQ_SEND_INITIAL_METADATA)) {
     finish_ioreq_op(call, GRPC_IOREQ_SEND_INITIAL_METADATA, success);
@@ -866,9 +875,11 @@
   call->sending = 0;
   unlock(exec_ctx, call);
   GRPC_CALL_INTERNAL_UNREF(exec_ctx, call, "sending");
+  GPR_TIMER_END("call_on_done_send", 0);
 }
 
 static void finish_message(grpc_call *call) {
+  GPR_TIMER_BEGIN("finish_message", 0);
   if (call->error_status_set == 0) {
     /* TODO(ctiller): this could be a lot faster if coded directly */
     grpc_byte_buffer *byte_buffer;
@@ -888,6 +899,7 @@
   gpr_slice_buffer_reset_and_unref(&call->incoming_message);
   GPR_ASSERT(call->incoming_message.count == 0);
   call->reading_message = 0;
+  GPR_TIMER_END("finish_message", 0);
 }
 
 static int begin_message(grpc_call *call, grpc_begin_message msg) {
@@ -977,7 +989,7 @@
   grpc_call *child_call;
   grpc_call *next_child_call;
   size_t i;
-  GRPC_TIMER_BEGIN(GRPC_PTAG_CALL_ON_DONE_RECV, 0);
+  GPR_TIMER_BEGIN("call_on_done_recv", 0);
   lock(call);
   call->receiving = 0;
   if (success) {
@@ -987,13 +999,19 @@
         case GRPC_NO_OP:
           break;
         case GRPC_OP_METADATA:
+          GPR_TIMER_BEGIN("recv_metadata", 0);
           recv_metadata(exec_ctx, call, &op->data.metadata);
+          GPR_TIMER_END("recv_metadata", 0);
           break;
         case GRPC_OP_BEGIN_MESSAGE:
+          GPR_TIMER_BEGIN("begin_message", 0);
           success = begin_message(call, op->data.begin_message);
+          GPR_TIMER_END("begin_message", 0);
           break;
         case GRPC_OP_SLICE:
+          GPR_TIMER_BEGIN("add_slice_to_message", 0);
           success = add_slice_to_message(call, op->data.slice);
+          GPR_TIMER_END("add_slice_to_message", 0);
           break;
       }
     }
@@ -1039,7 +1057,7 @@
   unlock(exec_ctx, call);
 
   GRPC_CALL_INTERNAL_UNREF(exec_ctx, call, "receiving");
-  GRPC_TIMER_END(GRPC_PTAG_CALL_ON_DONE_RECV, 0);
+  GPR_TIMER_END("call_on_done_recv", 0);
 }
 
 static int prepare_application_metadata(grpc_call *call, size_t count,
@@ -1517,16 +1535,25 @@
     grpc_mdelem *mdel = l->md;
     grpc_mdstr *key = mdel->key;
     if (key == grpc_channel_get_status_string(call->channel)) {
+      GPR_TIMER_BEGIN("status", 0);
       set_status_code(call, STATUS_FROM_WIRE, decode_status(mdel));
+      GPR_TIMER_END("status", 0);
     } else if (key == grpc_channel_get_message_string(call->channel)) {
+      GPR_TIMER_BEGIN("status-details", 0);
       set_status_details(call, STATUS_FROM_WIRE, GRPC_MDSTR_REF(mdel->value));
+      GPR_TIMER_END("status-details", 0);
     } else if (key ==
                grpc_channel_get_compression_algorithm_string(call->channel)) {
+      GPR_TIMER_BEGIN("compression_algorithm", 0);
       set_compression_algorithm(call, decode_compression(mdel));
+      GPR_TIMER_END("compression_algorithm", 0);
     } else if (key == grpc_channel_get_encodings_accepted_by_peer_string(
                           call->channel)) {
+      GPR_TIMER_BEGIN("encodings_accepted_by_peer", 0);
       set_encodings_accepted_by_peer(call, mdel->value->slice);
+      GPR_TIMER_END("encodings_accepted_by_peer", 0);
     } else {
+      GPR_TIMER_BEGIN("report_up", 0);
       dest = &call->buffered_metadata[is_trailing];
       if (dest->count == dest->capacity) {
         dest->capacity = GPR_MAX(dest->capacity + 8, dest->capacity * 2);
@@ -1547,12 +1574,15 @@
       }
       call->owned_metadata[call->owned_metadata_count++] = mdel;
       l->md = NULL;
+      GPR_TIMER_END("report_up", 0);
     }
   }
   if (gpr_time_cmp(md->deadline, gpr_inf_future(md->deadline.clock_type)) !=
           0 &&
       !call->is_client) {
+    GPR_TIMER_BEGIN("set_deadline_alarm", 0);
     set_deadline_alarm(exec_ctx, call, md->deadline);
+    GPR_TIMER_END("set_deadline_alarm", 0);
   }
   if (!is_trailing) {
     call->read_state = READ_STATE_GOT_INITIAL_METADATA;
@@ -1615,6 +1645,8 @@
   grpc_call_error error;
   grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
 
+  GPR_TIMER_BEGIN("grpc_call_start_batch", 0);
+
   GRPC_API_TRACE(
       "grpc_call_start_batch(call=%p, ops=%p, nops=%lu, tag=%p, reserved=%p)",
       5, (call, ops, (unsigned long)nops, tag, reserved));
@@ -1852,6 +1884,7 @@
                                               finish_func, tag);
 done:
   grpc_exec_ctx_finish(&exec_ctx);
+  GPR_TIMER_END("grpc_call_start_batch", 0);
   return error;
 }
 
diff --git a/src/core/surface/completion_queue.c b/src/core/surface/completion_queue.c
index 9e552c2..bcdb363 100644
--- a/src/core/surface/completion_queue.c
+++ b/src/core/surface/completion_queue.c
@@ -42,6 +42,7 @@
 #include "src/core/surface/call.h"
 #include "src/core/surface/event_string.h"
 #include "src/core/surface/surface_trace.h"
+#include "src/core/profiling/timers.h"
 #include <grpc/support/alloc.h>
 #include <grpc/support/atm.h>
 #include <grpc/support/log.h>
@@ -143,6 +144,8 @@
   int i;
   grpc_pollset_worker *pluck_worker;
 
+  GPR_TIMER_BEGIN("grpc_cq_end_op", 0);
+
   storage->tag = tag;
   storage->done = done;
   storage->done_arg = done_arg;
@@ -174,6 +177,8 @@
     gpr_mu_unlock(GRPC_POLLSET_MU(&cc->pollset));
     grpc_pollset_shutdown(exec_ctx, &cc->pollset, &cc->pollset_destroy_done);
   }
+
+  GPR_TIMER_END("grpc_cq_end_op", 0);
 }
 
 grpc_event grpc_completion_queue_next(grpc_completion_queue *cc,
@@ -184,6 +189,8 @@
   gpr_timespec now;
   grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
 
+  GPR_TIMER_BEGIN("grpc_completion_queue_next", 0);
+
   GRPC_API_TRACE(
       "grpc_completion_queue_next("
       "cc=%p, "
@@ -230,6 +237,9 @@
   GRPC_SURFACE_TRACE_RETURNED_EVENT(cc, &ret);
   GRPC_CQ_INTERNAL_UNREF(cc, "next");
   grpc_exec_ctx_finish(&exec_ctx);
+
+  GPR_TIMER_END("grpc_completion_queue_next", 0);
+
   return ret;
 }
 
@@ -267,6 +277,8 @@
   int first_loop = 1;
   grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
 
+  GPR_TIMER_BEGIN("grpc_completion_queue_pluck", 0);
+
   GRPC_API_TRACE(
       "grpc_completion_queue_pluck("
       "cc=%p, tag=%p, "
@@ -332,6 +344,9 @@
   GRPC_SURFACE_TRACE_RETURNED_EVENT(cc, &ret);
   GRPC_CQ_INTERNAL_UNREF(cc, "pluck");
   grpc_exec_ctx_finish(&exec_ctx);
+
+  GPR_TIMER_END("grpc_completion_queue_pluck", 0);
+
   return ret;
 }
 
diff --git a/src/core/surface/init.c b/src/core/surface/init.c
index 95011ca..715c90a 100644
--- a/src/core/surface/init.c
+++ b/src/core/surface/init.c
@@ -115,7 +115,7 @@
         gpr_log(GPR_ERROR, "Could not initialize census.");
       }
     }
-    grpc_timers_global_init();
+    gpr_timers_global_init();
     for (i = 0; i < g_number_of_plugins; i++) {
       if (g_all_of_the_plugins[i].init != NULL) {
         g_all_of_the_plugins[i].init();
@@ -133,7 +133,7 @@
   if (--g_initializations == 0) {
     grpc_iomgr_shutdown();
     census_shutdown();
-    grpc_timers_global_destroy();
+    gpr_timers_global_destroy();
     grpc_tracer_shutdown();
     grpc_resolver_registry_shutdown();
     for (i = 0; i < g_number_of_plugins; i++) {
diff --git a/src/core/transport/chttp2/parsing.c b/src/core/transport/chttp2/parsing.c
index a0977cc..5d4d8e7 100644
--- a/src/core/transport/chttp2/parsing.c
+++ b/src/core/transport/chttp2/parsing.c
@@ -35,6 +35,7 @@
 
 #include <string.h>
 
+#include "src/core/profiling/timers.h"
 #include "src/core/transport/chttp2/http2_errors.h"
 #include "src/core/transport/chttp2/status_conversion.h"
 #include "src/core/transport/chttp2/timeout_encoding.h"
@@ -68,6 +69,8 @@
   grpc_chttp2_stream_global *stream_global;
   grpc_chttp2_stream_parsing *stream_parsing;
 
+  GPR_TIMER_BEGIN("grpc_chttp2_prepare_to_read", 0);
+
   transport_parsing->next_stream_id = transport_global->next_stream_id;
 
   /* update the parsing view of incoming window */
@@ -89,6 +92,8 @@
       stream_parsing->incoming_window = stream_global->incoming_window;
     }
   }
+
+  GPR_TIMER_END("grpc_chttp2_prepare_to_read", 0);
 }
 
 void grpc_chttp2_publish_reads(
diff --git a/src/core/transport/chttp2/writing.c b/src/core/transport/chttp2/writing.c
index d1c9da6..69ad885 100644
--- a/src/core/transport/chttp2/writing.c
+++ b/src/core/transport/chttp2/writing.c
@@ -37,6 +37,7 @@
 
 #include <grpc/support/log.h>
 
+#include "src/core/profiling/timers.h"
 #include "src/core/transport/chttp2/http2_errors.h"
 
 static void finalize_outbuf(grpc_chttp2_transport_writing *transport_writing);
@@ -180,6 +181,8 @@
 static void finalize_outbuf(grpc_chttp2_transport_writing *transport_writing) {
   grpc_chttp2_stream_writing *stream_writing;
 
+  GPR_TIMER_BEGIN("finalize_outbuf", 0);
+
   while (
       grpc_chttp2_list_pop_writing_stream(transport_writing, &stream_writing)) {
     if (stream_writing->sopb.nops > 0 ||
@@ -208,6 +211,8 @@
     }
     grpc_chttp2_list_add_written_stream(transport_writing, stream_writing);
   }
+
+  GPR_TIMER_END("finalize_outbuf", 0);
 }
 
 void grpc_chttp2_cleanup_writing(
diff --git a/src/core/transport/chttp2_transport.c b/src/core/transport/chttp2_transport.c
index de74379..effc3c4 100644
--- a/src/core/transport/chttp2_transport.c
+++ b/src/core/transport/chttp2_transport.c
@@ -510,6 +510,7 @@
 static void lock(grpc_chttp2_transport *t) { gpr_mu_lock(&t->mu); }
 
 static void unlock(grpc_exec_ctx *exec_ctx, grpc_chttp2_transport *t) {
+  GPR_TIMER_BEGIN("unlock", 0);
   unlock_check_read_write_state(exec_ctx, t);
   if (!t->writing_active && !t->closed &&
       grpc_chttp2_unlocking_check_writes(&t->global, &t->writing)) {
@@ -520,6 +521,7 @@
   }
 
   gpr_mu_unlock(&t->mu);
+  GPR_TIMER_END("unlock", 0);
 }
 
 /*
@@ -546,6 +548,8 @@
   grpc_chttp2_transport_writing *transport_writing = transport_writing_ptr;
   grpc_chttp2_transport *t = TRANSPORT_FROM_WRITING(transport_writing);
 
+  GPR_TIMER_BEGIN("grpc_chttp2_terminate_writing", 0);
+
   lock(t);
 
   allow_endpoint_shutdown_locked(exec_ctx, t);
@@ -567,12 +571,16 @@
   unlock(exec_ctx, t);
 
   UNREF_TRANSPORT(exec_ctx, t, "writing");
+
+  GPR_TIMER_END("grpc_chttp2_terminate_writing", 0);
 }
 
 static void writing_action(grpc_exec_ctx *exec_ctx, void *gt,
                            int iomgr_success_ignored) {
   grpc_chttp2_transport *t = gt;
+  GPR_TIMER_BEGIN("writing_action", 0);
   grpc_chttp2_perform_writes(exec_ctx, &t->writing, t->ep);
+  GPR_TIMER_END("writing_action", 0);
 }
 
 void grpc_chttp2_add_incoming_goaway(
@@ -642,6 +650,7 @@
 static void perform_stream_op_locked(
     grpc_exec_ctx *exec_ctx, grpc_chttp2_transport_global *transport_global,
     grpc_chttp2_stream_global *stream_global, grpc_transport_stream_op *op) {
+  GPR_TIMER_BEGIN("perform_stream_op_locked", 0);
   if (op->cancel_with_status != GRPC_STATUS_OK) {
     cancel_from_api(transport_global, stream_global, op->cancel_with_status);
   }
@@ -713,6 +722,7 @@
   }
 
   grpc_exec_ctx_enqueue(exec_ctx, op->on_consumed, 1);
+  GPR_TIMER_END("perform_stream_op_locked", 0);
 }
 
 static void perform_stream_op(grpc_exec_ctx *exec_ctx, grpc_transport *gt,
@@ -1103,6 +1113,8 @@
   int keep_reading = 0;
   grpc_chttp2_transport *t = tp;
 
+  GPR_TIMER_BEGIN("recv_data", 0);
+
   lock(t);
   i = 0;
   GPR_ASSERT(!t->parsing_active);
@@ -1113,11 +1125,13 @@
                                      &t->parsing_stream_map);
     grpc_chttp2_prepare_to_read(&t->global, &t->parsing);
     gpr_mu_unlock(&t->mu);
+    GPR_TIMER_BEGIN("recv_data.parse", 0);
     for (; i < t->read_buffer.count &&
                grpc_chttp2_perform_read(exec_ctx, &t->parsing,
                                         t->read_buffer.slices[i]);
          i++)
       ;
+    GPR_TIMER_END("recv_data.parse", 0);
     gpr_mu_lock(&t->mu);
     if (i != t->read_buffer.count) {
       drop_connection(exec_ctx, t);
@@ -1154,6 +1168,8 @@
   } else {
     UNREF_TRANSPORT(exec_ctx, t, "recv_data");
   }
+
+  GPR_TIMER_END("recv_data", 0);
 }
 
 /*
diff --git a/src/core/transport/stream_op.c b/src/core/transport/stream_op.c
index 1cb2bd7..6493e77 100644
--- a/src/core/transport/stream_op.c
+++ b/src/core/transport/stream_op.c
@@ -38,6 +38,8 @@
 #include <grpc/support/alloc.h>
 #include <grpc/support/log.h>
 
+#include "src/core/profiling/timers.h"
+
 /* Exponential growth function: Given x, return a larger x.
    Currently we grow by 1.5 times upon reallocation. */
 #define GROW(x) (3 * (x) / 2)
@@ -300,6 +302,8 @@
   grpc_linked_mdelem *l;
   grpc_linked_mdelem *next;
 
+  GPR_TIMER_BEGIN("grpc_metadata_batch_filter", 0);
+
   assert_valid_list(&batch->list);
   assert_valid_list(&batch->garbage);
   for (l = batch->list.head; l; l = next) {
@@ -328,4 +332,6 @@
   }
   assert_valid_list(&batch->list);
   assert_valid_list(&batch->garbage);
+
+  GPR_TIMER_END("grpc_metadata_batch_filter", 0);
 }
diff --git a/src/core/tsi/fake_transport_security.c b/src/core/tsi/fake_transport_security.c
index a40268a..99e28ab 100644
--- a/src/core/tsi/fake_transport_security.c
+++ b/src/core/tsi/fake_transport_security.c
@@ -121,7 +121,7 @@
   buf[3] = (unsigned char)((value >> 24) & 0xFF);
   buf[2] = (unsigned char)((value >> 16) & 0xFF);
   buf[1] = (unsigned char)((value >> 8) & 0xFF);
-  buf[0] = (unsigned char)((value) & 0xFF);
+  buf[0] = (unsigned char)((value)&0xFF);
 }
 
 static void tsi_fake_frame_reset(tsi_fake_frame *frame, int needs_draining) {
diff --git a/src/cpp/client/channel.cc b/src/cpp/client/channel.cc
index dc8e304..c7974d6 100644
--- a/src/cpp/client/channel.cc
+++ b/src/cpp/client/channel.cc
@@ -78,7 +78,6 @@
                                       context->raw_deadline(), nullptr);
   }
   grpc_census_call_set_context(c_call, context->census_context());
-  GRPC_TIMER_MARK(GRPC_PTAG_CPP_CALL_CREATED, c_call);
   context->set_call(c_call, shared_from_this());
   return Call(c_call, this, cq);
 }
@@ -87,11 +86,9 @@
   static const size_t MAX_OPS = 8;
   size_t nops = 0;
   grpc_op cops[MAX_OPS];
-  GRPC_TIMER_BEGIN(GRPC_PTAG_CPP_PERFORM_OPS, call->call());
   ops->FillOps(cops, &nops);
   GPR_ASSERT(GRPC_CALL_OK ==
              grpc_call_start_batch(call->call(), cops, nops, ops, nullptr));
-  GRPC_TIMER_END(GRPC_PTAG_CPP_PERFORM_OPS, call->call());
 }
 
 void* Channel::RegisterMethod(const char* method) {
diff --git a/src/cpp/proto/proto_utils.cc b/src/cpp/proto/proto_utils.cc
index 4131fbe..b1330fd 100644
--- a/src/cpp/proto/proto_utils.cc
+++ b/src/cpp/proto/proto_utils.cc
@@ -42,6 +42,8 @@
 #include <grpc/support/port_platform.h>
 #include <grpc++/support/config.h>
 
+#include "src/core/profiling/timers.h"
+
 const int kMaxBufferLength = 8192;
 
 class GrpcBufferWriter GRPC_FINAL
@@ -158,6 +160,7 @@
 
 Status SerializeProto(const grpc::protobuf::Message& msg,
                       grpc_byte_buffer** bp) {
+  GPR_TIMER_SCOPE("SerializeProto", 0);
   int byte_size = msg.ByteSize();
   if (byte_size <= kMaxBufferLength) {
     gpr_slice slice = gpr_slice_malloc(byte_size);
@@ -176,6 +179,7 @@
 
 Status DeserializeProto(grpc_byte_buffer* buffer, grpc::protobuf::Message* msg,
                         int max_message_size) {
+  GPR_TIMER_SCOPE("DeserializeProto", 0);
   if (!buffer) {
     return Status(StatusCode::INTERNAL, "No payload");
   }
diff --git a/src/cpp/server/server.cc b/src/cpp/server/server.cc
index e09744b..f5063a0 100644
--- a/src/cpp/server/server.cc
+++ b/src/cpp/server/server.cc
@@ -540,6 +540,7 @@
 void Server::RunRpc() {
   // Wait for one more incoming rpc.
   bool ok;
+  GPR_TIMER_SCOPE("Server::RunRpc", 0);
   auto* mrd = SyncRequest::Wait(&cq_, &ok);
   if (mrd) {
     ScheduleCallback();
@@ -555,6 +556,7 @@
           mrd->TeardownRequest();
         }
       }
+      GPR_TIMER_SCOPE("cd.Run()", 0);
       cd.Run();
     }
   }
diff --git a/test/core/profiling/timers_test.c b/test/core/profiling/timers_test.c
index b79cde6..7070fe4 100644
--- a/test/core/profiling/timers_test.c
+++ b/test/core/profiling/timers_test.c
@@ -54,15 +54,15 @@
       for (j = 0; j < advance; j++) {
         switch (state[i]) {
           case 0:
-            GRPC_TIMER_MARK(STATE_0, i);
+            GPR_TIMER_MARK(STATE_0, i);
             state[i]++;
             break;
           case 1:
-            GRPC_TIMER_MARK(STATE_1, i);
+            GPR_TIMER_MARK(STATE_1, i);
             state[i]++;
             break;
           case 2:
-            GRPC_TIMER_MARK(STATE_2, i);
+            GPR_TIMER_MARK(STATE_2, i);
             state[i]++;
             break;
           case 3:
@@ -76,8 +76,8 @@
 
 int main(int argc, char **argv) {
   grpc_test_init(argc, argv);
-  grpc_timers_global_init();
+  gpr_timers_global_init();
   test_log_events(1000000);
-  grpc_timers_global_destroy();
+  gpr_timers_global_destroy();
   return 0;
 }
diff --git a/test/cpp/qps/client_sync.cc b/test/cpp/qps/client_sync.cc
index beef604..3c33a1c 100644
--- a/test/cpp/qps/client_sync.cc
+++ b/test/cpp/qps/client_sync.cc
@@ -59,6 +59,8 @@
 #include "test/cpp/qps/interarrival.h"
 #include "test/cpp/qps/timer.h"
 
+#include "src/core/profiling/timers.h"
+
 namespace grpc {
 namespace testing {
 
@@ -99,6 +101,7 @@
     WaitToIssue(thread_idx);
     auto* stub = channels_[thread_idx % channels_.size()].get_stub();
     double start = Timer::Now();
+    GPR_TIMER_SCOPE("SynchronousUnaryClient::ThreadFunc", 0);
     grpc::ClientContext context;
     grpc::Status s =
         stub->UnaryCall(&context, request_, &responses_[thread_idx]);
@@ -135,6 +138,7 @@
 
   bool ThreadFunc(Histogram* histogram, size_t thread_idx) GRPC_OVERRIDE {
     WaitToIssue(thread_idx);
+    GPR_TIMER_SCOPE("SynchronousStreamingClient::ThreadFunc", 0);
     double start = Timer::Now();
     if (stream_[thread_idx]->Write(request_) &&
         stream_[thread_idx]->Read(&responses_[thread_idx])) {
diff --git a/tools/doxygen/Doxyfile.core.internal b/tools/doxygen/Doxyfile.core.internal
index 5fd3ee7..d3be97e 100644
--- a/tools/doxygen/Doxyfile.core.internal
+++ b/tools/doxygen/Doxyfile.core.internal
@@ -852,7 +852,6 @@
 src/core/json/json_common.h \
 src/core/json/json_reader.h \
 src/core/json/json_writer.h \
-src/core/profiling/timers.h \
 src/core/statistics/census_interface.h \
 src/core/statistics/census_rpc_stats.h \
 src/core/surface/api_trace.h \
@@ -991,8 +990,6 @@
 src/core/json/json_reader.c \
 src/core/json/json_string.c \
 src/core/json/json_writer.c \
-src/core/profiling/basic_timers.c \
-src/core/profiling/stap_timers.c \
 src/core/surface/api_trace.c \
 src/core/surface/byte_buffer.c \
 src/core/surface/byte_buffer_queue.c \
@@ -1069,6 +1066,7 @@
 include/grpc/support/tls_msvc.h \
 include/grpc/support/tls_pthread.h \
 include/grpc/support/useful.h \
+src/core/profiling/timers.h \
 src/core/support/block_annotate.h \
 src/core/support/env.h \
 src/core/support/file.h \
@@ -1078,6 +1076,8 @@
 src/core/support/string_win32.h \
 src/core/support/thd_internal.h \
 src/core/support/time_precise.h \
+src/core/profiling/basic_timers.c \
+src/core/profiling/stap_timers.c \
 src/core/support/alloc.c \
 src/core/support/cmdline.c \
 src/core/support/cpu_iphone.c \
@@ -1113,6 +1113,7 @@
 src/core/support/thd_win32.c \
 src/core/support/time.c \
 src/core/support/time_posix.c \
+src/core/support/time_precise.c \
 src/core/support/time_win32.c \
 src/core/support/tls_pthread.c
 
diff --git a/tools/profile_analyzer/profile_analyzer.py b/tools/profile_analyzer/profile_analyzer.py
deleted file mode 100755
index e5e9a28..0000000
--- a/tools/profile_analyzer/profile_analyzer.py
+++ /dev/null
@@ -1,188 +0,0 @@
-#!/usr/bin/env python2.7
-# Copyright 2015, Google Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#     * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-#     * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-"""
-Read GRPC basic profiles, analyze the data.
-
-Usage:
-  bins/basicprof/qps_smoke_test > log
-  cat log | tools/profile_analyzer/profile_analyzer.py
-"""
-
-
-import collections
-import itertools
-import math
-import re
-
-# Create a regex to parse output of the C core basic profiler,
-# as defined in src/core/profiling/basic_timers.c.
-_RE_LINE = re.compile(r'GRPC_LAT_PROF ' +
-                      r'([0-9]+\.[0-9]+) 0x([0-9a-f]+) ([{}.!]) ([0-9]+) ' +
-                      r'([^ ]+) ([^ ]+) ([0-9]+)')
-
-Entry = collections.namedtuple(
-    'Entry',
-    ['time', 'thread', 'type', 'tag', 'id', 'file', 'line'])
-
-
-class ImportantMark(object):
-  def __init__(self, entry, stack):
-    self._entry = entry
-    self._pre_stack = stack
-    self._post_stack = list()
-    self._n = len(stack)  # we'll also compute times to that many closing }s
-
-  @property
-  def entry(self):
-    return self._entry
-
-  @property
-  def max_depth(self):
-    return self._n
-
-  def append_post_entry(self, post_entry):
-    if self._n > 0 and post_entry.thread == self._entry.thread:
-      self._post_stack.append(post_entry)
-      self._n -= 1
-
-  def get_deltas(self):
-    pre_and_post_stacks = itertools.chain(self._pre_stack, self._post_stack)
-    return collections.OrderedDict((stack_entry,
-                                   abs(self._entry.time - stack_entry.time))
-                                   for stack_entry in pre_and_post_stacks)
-
-
-def print_block_statistics(block_times):
-  print '{:<12s}  {:>12s} {:>12s} {:>12s} {:>12s}'.format(
-          'Block tag', '50th p.', '90th p.', '95th p.', '99th p.')
-  for tag, tag_times in sorted(block_times.iteritems()):
-    times = sorted(tag_times)
-    print '{:<12d}: {:>12.3f} {:>12.3f} {:>12.3f} {:>12.3f}'.format(
-            tag, percentile(times, 50), percentile(times, 90),
-            percentile(times, 95), percentile(times, 99))
-  print
-
-def print_grouped_imark_statistics(group_key, imarks_group):
-  values = collections.OrderedDict()
-  for imark in imarks_group:
-    deltas = imark.get_deltas()
-    for relative_entry, time_delta_us in deltas.iteritems():
-      key = '{tag} {type} ({file}:{line})'.format(**relative_entry._asdict())
-      l = values.setdefault(key, list())
-      l.append(time_delta_us)
-
-  print group_key
-  print '{:<50s}  {:>12s} {:>12s} {:>12s} {:>12s}'.format(
-        'Relative mark', '50th p.', '90th p.', '95th p.', '99th p.')
-  for key, time_values in values.iteritems():
-    time_values = sorted(time_values)
-    print '{:<50s}: {:>12.3f} {:>12.3f} {:>12.3f} {:>12.3f}'.format(
-          key, percentile(time_values, 50), percentile(time_values, 90),
-          percentile(time_values, 95), percentile(time_values, 99))
-  print
-
-def percentile(vals, percent):
-  """ Calculates the interpolated percentile given a sorted sequence and a
-  percent (in the usual 0-100 range)."""
-  assert vals, "Empty input sequence."
-  percent /= 100.0
-  k = (len(vals)-1) * percent
-  f = math.floor(k)
-  c = math.ceil(k)
-  if f == c:
-      return vals[int(k)]
-  # else, interpolate
-  d0 = vals[int(f)] * (c-k)
-  d1 = vals[int(c)] * (k-f)
-  return d0 + d1
-
-def entries(f):
-  for line in f:
-    m = _RE_LINE.match(line)
-    if not m: continue
-    yield Entry(time=float(m.group(1)),
-                thread=m.group(2),
-                type=m.group(3),
-                tag=int(m.group(4)),
-                id=m.group(5),
-                file=m.group(6),
-                line=m.group(7))
-
-def main(f):
-  percentiles = (50, 90, 95, 99)
-  threads = collections.defaultdict(lambda: collections.defaultdict(list))
-  times = collections.defaultdict(list)
-  important_marks = collections.defaultdict(list)
-  stack_depth = collections.defaultdict(int)
-  for entry in entries(f):
-    thread = threads[entry.thread]
-    if entry.type == '{':
-      thread[entry.tag].append(entry)
-      stack_depth[entry.thread] += 1
-    if entry.type == '!':
-      # Save a snapshot of the current stack inside a new ImportantMark instance.
-      # Get all entries _for any tag in the thread_.
-      stack = [e for entries_for_tag in thread.itervalues()
-                 for e in entries_for_tag]
-      imark_group_key = '{tag}/{thread}@{file}:{line}'.format(**entry._asdict())
-      important_marks[imark_group_key].append(ImportantMark(entry, stack))
-    elif entry.type == '}':
-      last = thread[entry.tag].pop()
-      times[entry.tag].append(entry.time - last.time)
-      # only access the last "depth" imarks for the tag.
-      depth = stack_depth[entry.thread]
-      for imarks_group in important_marks.itervalues():
-        for imark in imarks_group[-depth:]:
-          # if at a '}' deeper than where the current "imark" was found, ignore.
-          if depth > imark.max_depth: continue
-          imark.append_post_entry(entry)
-      stack_depth[entry.thread] -= 1
-
-  print
-  print 'Block marks:'
-  print '============'
-  print_block_statistics(times)
-
-  print
-  print 'Important marks:'
-  print '================'
-  for group_key, imarks_group in important_marks.iteritems():
-    print_grouped_imark_statistics(group_key, imarks_group)
-
-
-if __name__ == '__main__':
-  # If invoked without arguments, read off sys.stdin. If one argument is given,
-  # take it as a file name and open it for reading.
-  import sys
-  f = sys.stdin
-  if len(sys.argv) == 2:
-    f = open(sys.argv[1], 'r')
-  main(f)
diff --git a/tools/profiling/latency_profile/profile_analyzer.py b/tools/profiling/latency_profile/profile_analyzer.py
new file mode 100755
index 0000000..b12e107
--- /dev/null
+++ b/tools/profiling/latency_profile/profile_analyzer.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python2.7
+import argparse
+import collections
+import hashlib
+import itertools
+import json
+import math
+import tabulate
+import time
+
+
+SELF_TIME = object()
+TIME_FROM_SCOPE_START = object()
+TIME_TO_SCOPE_END = object()
+TIME_FROM_STACK_START = object()
+TIME_TO_STACK_END = object()
+
+
+argp = argparse.ArgumentParser(description='Process output of basic_prof builds')
+argp.add_argument('--source', default='latency_trace.txt', type=str)
+argp.add_argument('--fmt', choices=tabulate.tabulate_formats, default='simple')
+args = argp.parse_args()
+
+class LineItem(object):
+
+  def __init__(self, line, indent):
+    self.tag = line['tag']
+    self.indent = indent
+    self.start_time = line['t']
+    self.end_time = None
+    self.important = line['imp']
+    self.filename = line['file']
+    self.fileline = line['line']
+    self.times = {}
+
+
+class ScopeBuilder(object):
+
+  def __init__(self, call_stack_builder, line):
+    self.call_stack_builder = call_stack_builder
+    self.indent = len(call_stack_builder.stk)
+    self.top_line = LineItem(line, self.indent)
+    call_stack_builder.lines.append(self.top_line)
+    self.first_child_pos = len(call_stack_builder.lines)
+
+  def mark(self, line):
+    line_item = LineItem(line, self.indent + 1)
+    line_item.end_time = line_item.start_time
+    self.call_stack_builder.lines.append(line_item)
+
+  def finish(self, line):
+    assert line['tag'] == self.top_line.tag, 'expected %s, got %s' % (self.top_line.tag, line['tag'])
+    final_time_stamp = line['t']
+    assert self.top_line.end_time is None
+    self.top_line.end_time = final_time_stamp
+    assert SELF_TIME not in self.top_line.times
+    self.top_line.times[SELF_TIME] = final_time_stamp - self.top_line.start_time
+    for line in self.call_stack_builder.lines[self.first_child_pos:]:
+      if TIME_FROM_SCOPE_START not in line.times:
+        line.times[TIME_FROM_SCOPE_START] = line.start_time - self.top_line.start_time
+        line.times[TIME_TO_SCOPE_END] = final_time_stamp - line.end_time
+
+
+class CallStackBuilder(object):
+
+  def __init__(self):
+    self.stk = []
+    self.signature = hashlib.md5()
+    self.lines = []
+
+  def finish(self):
+    start_time = self.lines[0].start_time
+    end_time = self.lines[0].end_time
+    self.signature = self.signature.hexdigest()
+    for line in self.lines:
+      line.times[TIME_FROM_STACK_START] = line.start_time - start_time
+      line.times[TIME_TO_STACK_END] = end_time - line.end_time
+
+  def add(self, line):
+    line_type = line['type']
+    self.signature.update(line_type)
+    self.signature.update(line['tag'])
+    if line_type == '{':
+      self.stk.append(ScopeBuilder(self, line))
+      return False
+    elif line_type == '}':
+      self.stk.pop().finish(line)
+      if not self.stk:
+        self.finish()
+        return True
+      return False
+    elif line_type == '.' or line_type == '!':
+      self.stk[-1].mark(line)
+      return False
+    else:
+      raise Exception('Unknown line type: \'%s\'' % line_type)
+
+
+class CallStack(object):
+
+  def __init__(self, initial_call_stack_builder):
+    self.count = 1
+    self.signature = initial_call_stack_builder.signature
+    self.lines = initial_call_stack_builder.lines
+    for line in self.lines:
+      for key, val in line.times.items():
+        line.times[key] = [val]
+
+  def add(self, call_stack_builder):
+    assert self.signature == call_stack_builder.signature
+    self.count += 1
+    assert len(self.lines) == len(call_stack_builder.lines)
+    for lsum, line in itertools.izip(self.lines, call_stack_builder.lines):
+      assert lsum.tag == line.tag
+      assert lsum.times.keys() == line.times.keys()
+      for k, lst in lsum.times.iteritems():
+        lst.append(line.times[k])
+
+  def finish(self):
+    for line in self.lines:
+      for lst in line.times.itervalues():
+        lst.sort()
+
+builder = collections.defaultdict(CallStackBuilder)
+call_stacks = collections.defaultdict(CallStack)
+
+lines = 0
+start = time.time()
+with open(args.source) as f:
+  for line in f:
+    lines += 1
+    inf = json.loads(line)
+    thd = inf['thd']
+    cs = builder[thd]
+    if cs.add(inf):
+      if cs.signature in call_stacks:
+        call_stacks[cs.signature].add(cs)
+      else:
+        call_stacks[cs.signature] = CallStack(cs)
+      del builder[thd]
+time_taken = time.time() - start
+
+call_stacks = sorted(call_stacks.values(), key=lambda cs: cs.count, reverse=True)
+total_stacks = 0
+for cs in call_stacks:
+  total_stacks += cs.count
+  cs.finish()
+
+def percentile(N, percent, key=lambda x:x):
+    """
+    Find the percentile of a list of values.
+
+    @parameter N - is a list of values. Note N MUST BE already sorted.
+    @parameter percent - a float value from 0.0 to 1.0.
+    @parameter key - optional key function to compute value from each element of N.
+
+    @return - the percentile of the values
+    """
+    if not N:
+        return None
+    k = (len(N)-1) * percent
+    f = math.floor(k)
+    c = math.ceil(k)
+    if f == c:
+        return key(N[int(k)])
+    d0 = key(N[int(f)]) * (c-k)
+    d1 = key(N[int(c)]) * (k-f)
+    return d0+d1
+
+def tidy_tag(tag):
+  if tag[0:10] == 'GRPC_PTAG_':
+    return tag[10:]
+  return tag
+
+def time_string(values):
+  num_values = len(values)
+  return '%.1f/%.1f/%.1f' % (
+      1e6 * percentile(values, 0.5),
+      1e6 * percentile(values, 0.9),
+      1e6 * percentile(values, 0.99))
+
+def time_format(idx):
+  def ent(line, idx=idx):
+    if idx in line.times:
+      return time_string(line.times[idx])
+    return ''
+  return ent
+
+FORMAT = [
+  ('TAG', lambda line: '..'*line.indent + tidy_tag(line.tag)),
+  ('LOC', lambda line: '%s:%d' % (line.filename[line.filename.rfind('/')+1:], line.fileline)),
+  ('FROM_STACK_START', time_format(TIME_FROM_STACK_START)),
+  ('SELF', time_format(SELF_TIME)),
+  ('TO_STACK_END', time_format(TIME_TO_STACK_END)),
+  ('FROM_SCOPE_START', time_format(TIME_FROM_SCOPE_START)),
+  ('SELF', time_format(SELF_TIME)),
+  ('TO_SCOPE_END', time_format(TIME_TO_SCOPE_END)),
+]
+
+BANNER = {
+    'simple': 'Count: %(count)d',
+    'html': '<h1>Count: %(count)d</h1>'
+}
+
+if args.fmt == 'html':
+  print '<html>'
+  print '<head>'
+  print '<title>Profile Report</title>'
+  print '</head>'
+
+accounted_for = 0
+for cs in call_stacks:
+  if args.fmt in BANNER:
+    print BANNER[args.fmt] % {
+        'count': cs.count,
+    }
+  header, _ = zip(*FORMAT)
+  table = []
+  for line in cs.lines:
+    fields = []
+    for _, fn in FORMAT:
+      fields.append(fn(line))
+    table.append(fields)
+  print tabulate.tabulate(table, header, tablefmt=args.fmt)
+  accounted_for += cs.count
+  if accounted_for > .99 * total_stacks:
+    break
+
+if args.fmt == 'html':
+  print '</html>'
+
diff --git a/tools/profiling/latency_profile/run_latency_profile.sh b/tools/profiling/latency_profile/run_latency_profile.sh
new file mode 100755
index 0000000..41686be
--- /dev/null
+++ b/tools/profiling/latency_profile/run_latency_profile.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+set -ex
+
+cd $(dirname $0)/../../..
+
+BINS="sync_unary_ping_pong_test sync_streaming_ping_pong_test"
+CPUS=`python -c 'import multiprocessing; print multiprocessing.cpu_count()'`
+
+make CONFIG=basicprof -j$CPUS $BINS
+
+mkdir -p reports
+
+echo '<html><head></head><body>' > reports/index.html
+for bin in $BINS
+do
+  bins/basicprof/$bin
+  mv latency_trace.txt $bin.trace
+  echo "<a href='$bin.txt'>$bin</a><br/>" >> reports/index.html
+done
+for bin in $BINS
+do
+  tools/profiling/latency_profile/profile_analyzer.py \
+    --source=$bin.trace --fmt=simple > reports/$bin.txt &
+done
+echo '</body></html>' >> reports/index.html
+
+wait
+
diff --git a/tools/run_tests/sources_and_headers.json b/tools/run_tests/sources_and_headers.json
index 76d66ed..32c4903 100644
--- a/tools/run_tests/sources_and_headers.json
+++ b/tools/run_tests/sources_and_headers.json
@@ -12161,6 +12161,7 @@
       "include/grpc/support/tls_msvc.h", 
       "include/grpc/support/tls_pthread.h", 
       "include/grpc/support/useful.h", 
+      "src/core/profiling/timers.h", 
       "src/core/support/block_annotate.h", 
       "src/core/support/env.h", 
       "src/core/support/file.h", 
@@ -12201,6 +12202,9 @@
       "include/grpc/support/tls_msvc.h", 
       "include/grpc/support/tls_pthread.h", 
       "include/grpc/support/useful.h", 
+      "src/core/profiling/basic_timers.c", 
+      "src/core/profiling/stap_timers.c", 
+      "src/core/profiling/timers.h", 
       "src/core/support/alloc.c", 
       "src/core/support/block_annotate.h", 
       "src/core/support/cmdline.c", 
@@ -12244,6 +12248,7 @@
       "src/core/support/thd_win32.c", 
       "src/core/support/time.c", 
       "src/core/support/time_posix.c", 
+      "src/core/support/time_precise.c", 
       "src/core/support/time_precise.h", 
       "src/core/support/time_win32.c", 
       "src/core/support/tls_pthread.c"
@@ -12350,7 +12355,6 @@
       "src/core/json/json_common.h", 
       "src/core/json/json_reader.h", 
       "src/core/json/json_writer.h", 
-      "src/core/profiling/timers.h", 
       "src/core/security/auth_filters.h", 
       "src/core/security/base64.h", 
       "src/core/security/credentials.h", 
@@ -12569,9 +12573,6 @@
       "src/core/json/json_string.c", 
       "src/core/json/json_writer.c", 
       "src/core/json/json_writer.h", 
-      "src/core/profiling/basic_timers.c", 
-      "src/core/profiling/stap_timers.c", 
-      "src/core/profiling/timers.h", 
       "src/core/security/auth_filters.h", 
       "src/core/security/base64.c", 
       "src/core/security/base64.h", 
@@ -12857,7 +12858,6 @@
       "src/core/json/json_common.h", 
       "src/core/json/json_reader.h", 
       "src/core/json/json_writer.h", 
-      "src/core/profiling/timers.h", 
       "src/core/statistics/census_interface.h", 
       "src/core/statistics/census_rpc_stats.h", 
       "src/core/surface/api_trace.h", 
@@ -13061,9 +13061,6 @@
       "src/core/json/json_string.c", 
       "src/core/json/json_writer.c", 
       "src/core/json/json_writer.h", 
-      "src/core/profiling/basic_timers.c", 
-      "src/core/profiling/stap_timers.c", 
-      "src/core/profiling/timers.h", 
       "src/core/statistics/census_interface.h", 
       "src/core/statistics/census_rpc_stats.h", 
       "src/core/surface/api_trace.c", 
diff --git a/vsprojects/vcxproj/gpr/gpr.vcxproj b/vsprojects/vcxproj/gpr/gpr.vcxproj
index 479286f..fa495db 100644
--- a/vsprojects/vcxproj/gpr/gpr.vcxproj
+++ b/vsprojects/vcxproj/gpr/gpr.vcxproj
@@ -163,6 +163,7 @@
     <ClInclude Include="..\..\..\include\grpc\support\useful.h" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\..\src\core\profiling\timers.h" />
     <ClInclude Include="..\..\..\src\core\support\block_annotate.h" />
     <ClInclude Include="..\..\..\src\core\support\env.h" />
     <ClInclude Include="..\..\..\src\core\support\file.h" />
@@ -174,6 +175,10 @@
     <ClInclude Include="..\..\..\src\core\support\time_precise.h" />
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\..\src\core\profiling\basic_timers.c">
+    </ClCompile>
+    <ClCompile Include="..\..\..\src\core\profiling\stap_timers.c">
+    </ClCompile>
     <ClCompile Include="..\..\..\src\core\support\alloc.c">
     </ClCompile>
     <ClCompile Include="..\..\..\src\core\support\cmdline.c">
@@ -244,6 +249,8 @@
     </ClCompile>
     <ClCompile Include="..\..\..\src\core\support\time_posix.c">
     </ClCompile>
+    <ClCompile Include="..\..\..\src\core\support\time_precise.c">
+    </ClCompile>
     <ClCompile Include="..\..\..\src\core\support\time_win32.c">
     </ClCompile>
     <ClCompile Include="..\..\..\src\core\support\tls_pthread.c">
diff --git a/vsprojects/vcxproj/gpr/gpr.vcxproj.filters b/vsprojects/vcxproj/gpr/gpr.vcxproj.filters
index be5bb51..ee03e45 100644
--- a/vsprojects/vcxproj/gpr/gpr.vcxproj.filters
+++ b/vsprojects/vcxproj/gpr/gpr.vcxproj.filters
@@ -1,6 +1,12 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup>
+    <ClCompile Include="..\..\..\src\core\profiling\basic_timers.c">
+      <Filter>src\core\profiling</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\src\core\profiling\stap_timers.c">
+      <Filter>src\core\profiling</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\src\core\support\alloc.c">
       <Filter>src\core\support</Filter>
     </ClCompile>
@@ -106,6 +112,9 @@
     <ClCompile Include="..\..\..\src\core\support\time_posix.c">
       <Filter>src\core\support</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\src\core\support\time_precise.c">
+      <Filter>src\core\support</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\src\core\support\time_win32.c">
       <Filter>src\core\support</Filter>
     </ClCompile>
@@ -197,6 +206,9 @@
     </ClInclude>
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\..\src\core\profiling\timers.h">
+      <Filter>src\core\profiling</Filter>
+    </ClInclude>
     <ClInclude Include="..\..\..\src\core\support\block_annotate.h">
       <Filter>src\core\support</Filter>
     </ClInclude>
@@ -242,6 +254,9 @@
     <Filter Include="src\core">
       <UniqueIdentifier>{c5e1baa7-de77-beb1-9675-942261648f79}</UniqueIdentifier>
     </Filter>
+    <Filter Include="src\core\profiling">
+      <UniqueIdentifier>{93b7086c-8c8a-6bbf-fb14-1f166bf0146a}</UniqueIdentifier>
+    </Filter>
     <Filter Include="src\core\support">
       <UniqueIdentifier>{bb116f2a-ea2a-c233-82da-0c54e3cbfec1}</UniqueIdentifier>
     </Filter>
diff --git a/vsprojects/vcxproj/grpc/grpc.vcxproj b/vsprojects/vcxproj/grpc/grpc.vcxproj
index 88b883f..23bcd0c 100644
--- a/vsprojects/vcxproj/grpc/grpc.vcxproj
+++ b/vsprojects/vcxproj/grpc/grpc.vcxproj
@@ -338,7 +338,6 @@
     <ClInclude Include="..\..\..\src\core\json\json_common.h" />
     <ClInclude Include="..\..\..\src\core\json\json_reader.h" />
     <ClInclude Include="..\..\..\src\core\json\json_writer.h" />
-    <ClInclude Include="..\..\..\src\core\profiling\timers.h" />
     <ClInclude Include="..\..\..\src\core\statistics\census_interface.h" />
     <ClInclude Include="..\..\..\src\core\statistics\census_rpc_stats.h" />
     <ClInclude Include="..\..\..\src\core\surface\api_trace.h" />
@@ -576,10 +575,6 @@
     </ClCompile>
     <ClCompile Include="..\..\..\src\core\json\json_writer.c">
     </ClCompile>
-    <ClCompile Include="..\..\..\src\core\profiling\basic_timers.c">
-    </ClCompile>
-    <ClCompile Include="..\..\..\src\core\profiling\stap_timers.c">
-    </ClCompile>
     <ClCompile Include="..\..\..\src\core\surface\api_trace.c">
     </ClCompile>
     <ClCompile Include="..\..\..\src\core\surface\byte_buffer.c">
diff --git a/vsprojects/vcxproj/grpc/grpc.vcxproj.filters b/vsprojects/vcxproj/grpc/grpc.vcxproj.filters
index 1a46ac8..1cfccf0 100644
--- a/vsprojects/vcxproj/grpc/grpc.vcxproj.filters
+++ b/vsprojects/vcxproj/grpc/grpc.vcxproj.filters
@@ -292,12 +292,6 @@
     <ClCompile Include="..\..\..\src\core\json\json_writer.c">
       <Filter>src\core\json</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\src\core\profiling\basic_timers.c">
-      <Filter>src\core\profiling</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\..\src\core\profiling\stap_timers.c">
-      <Filter>src\core\profiling</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\..\src\core\surface\api_trace.c">
       <Filter>src\core\surface</Filter>
     </ClCompile>
@@ -725,9 +719,6 @@
     <ClInclude Include="..\..\..\src\core\json\json_writer.h">
       <Filter>src\core\json</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\..\src\core\profiling\timers.h">
-      <Filter>src\core\profiling</Filter>
-    </ClInclude>
     <ClInclude Include="..\..\..\src\core\statistics\census_interface.h">
       <Filter>src\core\statistics</Filter>
     </ClInclude>
@@ -899,9 +890,6 @@
     <Filter Include="src\core\json">
       <UniqueIdentifier>{e665cc0e-b994-d7c5-cc18-2007392019f0}</UniqueIdentifier>
     </Filter>
-    <Filter Include="src\core\profiling">
-      <UniqueIdentifier>{87674b72-0f05-0469-481a-bd8c7af9ad80}</UniqueIdentifier>
-    </Filter>
     <Filter Include="src\core\security">
       <UniqueIdentifier>{1d850ac6-e639-4eab-5338-4ba40272fcc9}</UniqueIdentifier>
     </Filter>
diff --git a/vsprojects/vcxproj/grpc_unsecure/grpc_unsecure.vcxproj b/vsprojects/vcxproj/grpc_unsecure/grpc_unsecure.vcxproj
index cddd003..e308a03 100644
--- a/vsprojects/vcxproj/grpc_unsecure/grpc_unsecure.vcxproj
+++ b/vsprojects/vcxproj/grpc_unsecure/grpc_unsecure.vcxproj
@@ -317,7 +317,6 @@
     <ClInclude Include="..\..\..\src\core\json\json_common.h" />
     <ClInclude Include="..\..\..\src\core\json\json_reader.h" />
     <ClInclude Include="..\..\..\src\core\json\json_writer.h" />
-    <ClInclude Include="..\..\..\src\core\profiling\timers.h" />
     <ClInclude Include="..\..\..\src\core\statistics\census_interface.h" />
     <ClInclude Include="..\..\..\src\core\statistics\census_rpc_stats.h" />
     <ClInclude Include="..\..\..\src\core\surface\api_trace.h" />
@@ -515,10 +514,6 @@
     </ClCompile>
     <ClCompile Include="..\..\..\src\core\json\json_writer.c">
     </ClCompile>
-    <ClCompile Include="..\..\..\src\core\profiling\basic_timers.c">
-    </ClCompile>
-    <ClCompile Include="..\..\..\src\core\profiling\stap_timers.c">
-    </ClCompile>
     <ClCompile Include="..\..\..\src\core\surface\api_trace.c">
     </ClCompile>
     <ClCompile Include="..\..\..\src\core\surface\byte_buffer.c">
diff --git a/vsprojects/vcxproj/grpc_unsecure/grpc_unsecure.vcxproj.filters b/vsprojects/vcxproj/grpc_unsecure/grpc_unsecure.vcxproj.filters
index 01e0d8d..370d1f6 100644
--- a/vsprojects/vcxproj/grpc_unsecure/grpc_unsecure.vcxproj.filters
+++ b/vsprojects/vcxproj/grpc_unsecure/grpc_unsecure.vcxproj.filters
@@ -232,12 +232,6 @@
     <ClCompile Include="..\..\..\src\core\json\json_writer.c">
       <Filter>src\core\json</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\src\core\profiling\basic_timers.c">
-      <Filter>src\core\profiling</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\..\src\core\profiling\stap_timers.c">
-      <Filter>src\core\profiling</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\..\src\core\surface\api_trace.c">
       <Filter>src\core\surface</Filter>
     </ClCompile>
@@ -623,9 +617,6 @@
     <ClInclude Include="..\..\..\src\core\json\json_writer.h">
       <Filter>src\core\json</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\..\src\core\profiling\timers.h">
-      <Filter>src\core\profiling</Filter>
-    </ClInclude>
     <ClInclude Include="..\..\..\src\core\statistics\census_interface.h">
       <Filter>src\core\statistics</Filter>
     </ClInclude>
@@ -797,9 +788,6 @@
     <Filter Include="src\core\json">
       <UniqueIdentifier>{443ffc61-1bea-2477-6e54-1ddf8c139264}</UniqueIdentifier>
     </Filter>
-    <Filter Include="src\core\profiling">
-      <UniqueIdentifier>{7f91d9bf-c9de-835a-d74d-b16f843b89a9}</UniqueIdentifier>
-    </Filter>
     <Filter Include="src\core\statistics">
       <UniqueIdentifier>{e084164c-a069-00e3-db35-4e0b1cd6f0b7}</UniqueIdentifier>
     </Filter>