Second batch of feedback.
diff --git a/include/grpc/support/alloc.h b/include/grpc/support/alloc.h
index c758065..09ea975 100644
--- a/include/grpc/support/alloc.h
+++ b/include/grpc/support/alloc.h
@@ -46,8 +46,8 @@
 void gpr_free(void *ptr);
 /* realloc, never returns NULL */
 void *gpr_realloc(void *p, size_t size);
-/* aligned malloc, never returns NULL, alignment must be power of 2 */
-void *gpr_malloc_aligned(size_t size, size_t alignment);
+/* aligned malloc, never returns NULL, will align to 1 << alignment_log */
+void *gpr_malloc_aligned(size_t size, size_t alignment_log);
 /* free memory allocated by gpr_malloc_aligned */
 void gpr_free_aligned(void *ptr);
 
diff --git a/include/grpc/support/atm_win32.h b/include/grpc/support/atm_win32.h
index acacf12..9bb1cfe 100644
--- a/include/grpc/support/atm_win32.h
+++ b/include/grpc/support/atm_win32.h
@@ -93,11 +93,13 @@
 static __inline gpr_atm gpr_atm_full_fetch_add(gpr_atm *p, gpr_atm delta) {
   /* Use a CAS operation to get pointer-sized fetch and add */
   gpr_atm old;
+#ifdef GPR_ARCH_64
   do {
     old = *p;
-#ifdef GPR_ARCH_64
   } while (old != (gpr_atm)InterlockedCompareExchange64(p, old + delta, old));
 #else
+  do {
+    old = *p;
   } while (old != (gpr_atm)InterlockedCompareExchange(p, old + delta, old));
 #endif
   return old;
diff --git a/include/grpc/support/port_platform.h b/include/grpc/support/port_platform.h
index 27efa29..0a65175 100644
--- a/include/grpc/support/port_platform.h
+++ b/include/grpc/support/port_platform.h
@@ -147,16 +147,18 @@
 #include <stdint.h>
 
 /* Cache line alignment */
-#ifndef GPR_CACHELINE_SIZE
+#ifndef GPR_CACHELINE_SIZE_LOG
 #if defined(__i386__) || defined(__x86_64__)
-#define GPR_CACHELINE_SIZE 64
+#define GPR_CACHELINE_SIZE_LOG 6
 #endif
-#ifndef GPR_CACHELINE_SIZE
+#ifndef GPR_CACHELINE_SIZE_LOG
 /* A reasonable default guess. Note that overestimates tend to waste more
    space, while underestimates tend to waste more time. */
-#define GPR_CACHELINE_SIZE 64
-#endif /* GPR_CACHELINE_SIZE */
-#endif /* GPR_CACHELINE_SIZE */
+#define GPR_CACHELINE_SIZE_LOG 6
+#endif /* GPR_CACHELINE_SIZE_LOG */
+#endif /* GPR_CACHELINE_SIZE_LOG */
+
+#define GPR_CACHELINE_SIZE (1 << GPR_CACHELINE_SIZE_LOG)
 
 /* scrub GCC_ATOMIC if it's not available on this compiler */
 #if defined(GPR_GCC_ATOMIC) && !defined(__ATOMIC_RELAXED)
diff --git a/include/grpc/support/slice.h b/include/grpc/support/slice.h
index 261e3ba..8a21290 100644
--- a/include/grpc/support/slice.h
+++ b/include/grpc/support/slice.h
@@ -165,7 +165,9 @@
 
 gpr_slice gpr_empty_slice(void);
 
-/* Returns <0 if a < b, ==0 if a == b, >0 if a > b */
+/* Returns <0 if a < b, ==0 if a == b, >0 if a > b
+   The order is arbitrary, and is not guaranteed to be stable across different
+   versions of the API. */
 int gpr_slice_cmp(gpr_slice a, gpr_slice b);
 int gpr_slice_str_cmp(gpr_slice a, const char *b);
 
diff --git a/src/core/statistics/census_log.c b/src/core/statistics/census_log.c
index 24e4687..ec56ce3 100644
--- a/src/core/statistics/census_log.c
+++ b/src/core/statistics/census_log.c
@@ -475,11 +475,11 @@
   g_log.block_being_read = NULL;
   gpr_atm_rel_store(&g_log.is_full, 0);
   g_log.core_local_blocks = (cl_core_local_block*)gpr_malloc_aligned(
-      g_log.num_cores * sizeof(cl_core_local_block), GPR_CACHELINE_SIZE);
+      g_log.num_cores * sizeof(cl_core_local_block), GPR_CACHELINE_SIZE_LOG);
   memset(g_log.core_local_blocks, 0,
          g_log.num_cores * sizeof(cl_core_local_block));
   g_log.blocks = (cl_block*)gpr_malloc_aligned(
-      g_log.num_blocks * sizeof(cl_block), GPR_CACHELINE_SIZE);
+      g_log.num_blocks * sizeof(cl_block), GPR_CACHELINE_SIZE_LOG);
   memset(g_log.blocks, 0, g_log.num_blocks * sizeof(cl_block));
   g_log.buffer = gpr_malloc(g_log.num_blocks * CENSUS_LOG_MAX_RECORD_SIZE);
   memset(g_log.buffer, 0, g_log.num_blocks * CENSUS_LOG_MAX_RECORD_SIZE);
diff --git a/src/core/support/alloc.c b/src/core/support/alloc.c
index 44f343b..a19a014 100644
--- a/src/core/support/alloc.c
+++ b/src/core/support/alloc.c
@@ -54,7 +54,8 @@
   return p;
 }
 
-void *gpr_malloc_aligned(size_t size, size_t alignment) {
+void *gpr_malloc_aligned(size_t size, size_t alignment_log) {
+  size_t alignment = 1 << alignment_log;
   size_t extra = alignment - 1 + sizeof(void *);
   void *p = gpr_malloc(size + extra);
   void **ret = (void **)(((gpr_uintptr)p + extra) & ~(alignment - 1));
diff --git a/src/core/support/cpu_linux.c b/src/core/support/cpu_linux.c
index ef6bf9c..37e840d 100644
--- a/src/core/support/cpu_linux.c
+++ b/src/core/support/cpu_linux.c
@@ -39,25 +39,28 @@
 
 #ifdef GPR_CPU_LINUX
 
-#include <grpc/support/cpu.h>
-
 #include <sched.h>
 #include <errno.h>
 #include <unistd.h>
 #include <string.h>
 
+#include <grpc/support/cpu.h>
 #include <grpc/support/log.h>
+#include <grpc/support/sync.h>
+
+static int ncpus = 0;
+
+static void init_num_cpus() {
+  ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+  if (ncpus < 1) {
+    gpr_log(GPR_ERROR, "Cannot determine number of CPUs: assuming 1");
+    ncpus = 1;
+  }
+}
 
 unsigned gpr_cpu_num_cores(void) {
-  static int ncpus = 0;
-  /* FIXME: !threadsafe */
-  if (ncpus == 0) {
-    ncpus = sysconf(_SC_NPROCESSORS_ONLN);
-    if (ncpus < 1) {
-      gpr_log(GPR_ERROR, "Cannot determine number of CPUs: assuming 1");
-      ncpus = 1;
-    }
-  }
+  static gpr_once once = GPR_ONCE_INIT;
+  gpr_once_init(&once, init_num_cpus);
   return ncpus;
 }
 
diff --git a/src/core/support/cpu_posix.c b/src/core/support/cpu_posix.c
index 91ce80c..33c7b90 100644
--- a/src/core/support/cpu_posix.c
+++ b/src/core/support/cpu_posix.c
@@ -43,15 +43,19 @@
 
 static __thread char magic_thread_local;
 
-unsigned gpr_cpu_num_cores(void) {
-  static int ncpus = 0;
-  if (ncpus == 0) {
-    ncpus = sysconf(_SC_NPROCESSORS_ONLN);
-    if (ncpus < 1) {
-      gpr_log(GPR_ERROR, "Cannot determine number of CPUs: assuming 1");
-      ncpus = 1;
-    }
+static int ncpus = 0;
+
+static void init_ncpus() {
+  ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+  if (ncpus < 1) {
+    gpr_log(GPR_ERROR, "Cannot determine number of CPUs: assuming 1");
+    ncpus = 1;
   }
+}
+
+unsigned gpr_cpu_num_cores(void) {
+  static gpr_once once = GPR_ONCE_INIT;
+  gpr_once_init(&once, init_num_cpus);
   return ncpus;
 }