src/core/ext/census/mlog.cc - platform/external/grpc-grpc - Gitiles

 /*
  *
  * Copyright 2015 gRPC authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  */

 // Implements an efficient in-memory log, optimized for multiple writers and
 // a single reader. Available log space is divided up in blocks of
 // CENSUS_LOG_MAX_RECORD_SIZE (2^CENSUS_LOG_2_MAX_RECORD_SIZE) bytes. A block
 // can be in one of the following three data structures:
 // - Free blocks (free_block_list)
 // - Blocks with unread data (dirty_block_list)
 // - Blocks currently attached to cores (core_local_blocks[])
 //
 // census_log_start_write() moves a block from core_local_blocks[] to the end of
 // dirty_block_list when block:
 // - is out-of-space OR
 // - has an incomplete record (an incomplete record occurs when a thread calls
 //   census_log_start_write() and is context-switched before calling
 //   census_log_end_write())
 // So, blocks in dirty_block_list are ordered, from oldest to newest, by the
 // time when block is detached from the core.
 //
 // census_log_read_next() first iterates over dirty_block_list and then
 // core_local_blocks[]. It moves completely read blocks from dirty_block_list
 // to free_block_list. Blocks in core_local_blocks[] are not freed, even when
 // completely read.
 //
 // If the log is configured to discard old records and free_block_list is empty,
 // census_log_start_write() iterates over dirty_block_list to allocate a
 // new block. It moves the oldest available block (no pending read/write) to
 // core_local_blocks[].
 //
 // cl_core_local_block is used to implement a map from core id to the block
 // associated with that core. This mapping is advisory. It is possible that the
 // block returned by this mapping is no longer associated with that core. This
 // mapping is updated, lazily, by census_log_start_write().
 //
 // Locking in block struct:
 //
 // Exclusive g_log.lock must be held before calling any functions operating on
 // block structs except census_log_start_write() and census_log_end_write().
 //
 // Writes to a block are serialized via writer_lock. census_log_start_write()
 // acquires this lock and census_log_end_write() releases it. On failure to
 // acquire the lock, writer allocates a new block for the current core and
 // updates core_local_block accordingly.
 //
 // Simultaneous read and write access is allowed. Readers can safely read up to
 // committed bytes (bytes_committed).
 //
 // reader_lock protects the block, currently being read, from getting recycled.
 // start_read() acquires reader_lock and end_read() releases the lock.
 //
 // Read/write access to a block is disabled via try_disable_access(). It returns
 // with both writer_lock and reader_lock held. These locks are subsequently
 // released by enable_access() to enable access to the block.
 //
 // A note on naming: Most function/struct names are prepended by cl_
 // (shorthand for census_log). Further, functions that manipulate structures
 // include the name of the structure, which will be passed as the first
 // argument. E.g. cl_block_initialize() will initialize a cl_block.

 #include "src/core/ext/census/mlog.h"
 #include <grpc/support/alloc.h>
 #include <grpc/support/atm.h>
 #include <grpc/support/cpu.h>
 #include <grpc/support/log.h>
 #include <grpc/support/sync.h>
 #include <grpc/support/useful.h>
 #include <stdbool.h>
 #include <string.h>

 // End of platform specific code

 // Node of an intrusive, circular doubly-linked list. `block` points at the
 // cl_block that embeds this node; it is NULL for the head/tail sentinel that
 // cl_block_list_initialize() sets up.
 typedef struct census_log_block_list_struct {
   struct census_log_block_list_struct* next;
   struct census_log_block_list_struct* prev;
   struct census_log_block* block;
 } cl_block_list_struct;

 // Metadata for one fixed-size slice of the log buffer.
 typedef struct census_log_block {
   // Pointer to underlying buffer.
   char* buffer;
   // Try-lock (0 = free, 1 = held) taken for the duration of a write and while
   // access to the block is disabled.
   gpr_atm writer_lock;
   // Try-lock (0 = free, 1 = held) taken for the duration of a read and while
   // access to the block is disabled.
   gpr_atm reader_lock;
   // Keeps completely written bytes. Declared atomic because accessed
   // simultaneously by reader and writer.
   gpr_atm bytes_committed;
   // Bytes already read.
   size_t bytes_read;
   // Links for list.
   cl_block_list_struct link;
 // We want this structure to be cacheline aligned. We assume the following
 // sizes for the various parts on 32/64bit systems:
 // type                 32b size    64b size
 // char*                   4           8
 // 3x gpr_atm             12          24
 // size_t                  4           8
 // cl_block_list_struct   12          24
 // TOTAL                  32          64
 //
 // Depending on the size of our cacheline and the architecture, we
 // selectively add char padding to this structure. The resulting size is
 // checked via assert in census_log_initialize().
 #if defined(GPR_ARCH_64)
 #define CL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 64)
 #else
 #if defined(GPR_ARCH_32)
 #define CL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 32)
 #else
 #error "Unknown architecture"
 #endif
 #endif
 #if CL_BLOCK_PAD_SIZE > 0
   char padding[CL_BLOCK_PAD_SIZE];
 #endif
 } cl_block;

 // A list of cl_blocks, doubly-linked through cl_block::link. The list is
 // circular: `ht` is a sentinel node whose `next` is the first element and
 // whose `prev` is the last one.
 typedef struct census_log_block_list {
   int32_t count;            // Number of items in list.
   cl_block_list_struct ht;  // Sentinel: head/tail of linked list.
 } cl_block_list;

 // Cacheline aligned block pointers to avoid false sharing. Block pointer must
 // be initialized via cl_core_local_block_set_block() before calling other
 // functions.
 typedef struct census_log_core_local_block {
   // A cl_block* stored as a gpr_atm so that it can be read and written
   // atomically.
   gpr_atm block;
 // Ensure cacheline alignment: we assume sizeof(gpr_atm) == 4 or 8
 #if defined(GPR_ARCH_64)
 #define CL_CORE_LOCAL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 8)
 #else
 #if defined(GPR_ARCH_32)
 #define CL_CORE_LOCAL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 4)
 #else
 #error "Unknown architecture"
 #endif
 #endif
 #if CL_CORE_LOCAL_BLOCK_PAD_SIZE > 0
   char padding[CL_CORE_LOCAL_BLOCK_PAD_SIZE];
 #endif
 } cl_core_local_block;

 // Complete state of the in-memory log.
 struct census_log {
   // Non-zero if dirty blocks may be recycled (dropping unread data) once no
   // free block is left.
   int discard_old_records;
   // Number of cores (aka hardware-contexts)
   unsigned num_cores;
   // Number of blocks in the log; each block is CENSUS_LOG_MAX_RECORD_SIZE
   // bytes.
   uint32_t num_blocks;
   cl_block* blocks;                        // Block metadata.
   cl_core_local_block* core_local_blocks;  // Keeps core to block mappings.
   gpr_mu lock;
   int initialized;  // has log been initialized?
   // Keeps the state of the reader iterator. A value of num_cores means the
   // dirty list is being traversed; smaller values count down through
   // core_local_blocks[]. A value of 0 indicates that the iterator has reached
   // the end. census_log_init_reader() resets the value to num_cores to
   // restart iteration.
   uint32_t read_iterator_state;
   // Points to the block being read. If non-NULL, the block is locked for
   // reading (block_being_read->reader_lock is held).
   cl_block* block_being_read;
   // Backing storage for all blocks (num_blocks * CENSUS_LOG_MAX_RECORD_SIZE
   // bytes).
   char* buffer;
   cl_block_list free_block_list;
   cl_block_list dirty_block_list;
   // Number of census_log_start_write() calls that failed for lack of space.
   gpr_atm out_of_space_count;
 };

 // Single internal log.
 static struct census_log g_log;

 // Functions that operate on an atomic memory location used as a lock.

 // Tries to take the lock stored in *lock by atomically swapping it from 0
 // (unlocked) to 1 (locked). Never blocks. Returns non-zero if the lock was
 // acquired.
 static int cl_try_lock(gpr_atm* lock) {
   const int acquired = gpr_atm_acq_cas(lock, 0, 1);
   return acquired;
 }

 // Releases a lock taken with cl_try_lock() by storing 0 into *lock.
 static void cl_unlock(gpr_atm* lock) {
   gpr_atm_rel_store(lock, 0);
 }

 // Functions that operate on cl_core_local_block's.

 // Stores `block` (which may be NULL) as the block attached to the core-local
 // slot `clb`.
 static void cl_core_local_block_set_block(cl_core_local_block* clb,
                                           cl_block* block) {
   const gpr_atm encoded = (gpr_atm)block;
   gpr_atm_rel_store(&clb->block, encoded);
 }

 // Loads the block currently attached to the core-local slot `clb`; the
 // result is NULL when no block is attached.
 static cl_block* cl_core_local_block_get_block(cl_core_local_block* clb) {
   const gpr_atm encoded = gpr_atm_acq_load(&clb->block);
   return (cl_block*)encoded;
 }

 // Functions that operate on cl_block_list_struct's.

 // Turns `bls` into a detached node: it links to itself in both directions
 // and records `block` as its owner.
 static void cl_block_list_struct_initialize(cl_block_list_struct* bls,
                                             cl_block* block) {
   bls->block = block;
   bls->prev = bls;
   bls->next = bls;
 }

 // Functions that operate on cl_block_list's.

 // Makes `list` empty: zero count and a self-linked sentinel that owns no
 // block.
 static void cl_block_list_initialize(cl_block_list* list) {
   cl_block_list_struct* sentinel = &list->ht;
   list->count = 0;
   sentinel->next = sentinel->prev = sentinel;
   sentinel->block = NULL;
 }

 // Returns the first block of `list`, or NULL if the list is empty (an empty
 // list's sentinel links to itself and its `block` is NULL).
 static cl_block* cl_block_list_head(cl_block_list* list) {
   cl_block_list_struct* first = list->ht.next;
   return first->block;
 }

 // Links node `e` into `list` immediately after node `pos` and bumps the
 // element count.
 static void cl_block_list_insert(cl_block_list* list, cl_block_list_struct* pos,
                                  cl_block_list_struct* e) {
   cl_block_list_struct* successor = pos->next;
   e->next = successor;
   e->prev = pos;
   successor->prev = e;
   pos->next = e;
   list->count++;
 }

 // Makes `block` the first element of `list` by inserting it right after the
 // sentinel.
 static void cl_block_list_insert_at_head(cl_block_list* list, cl_block* block) {
   cl_block_list_struct* sentinel = &list->ht;
   cl_block_list_insert(list, sentinel, &block->link);
 }

 // Makes `block` the last element of `list` by inserting it after the current
 // last node (the sentinel's `prev`).
 static void cl_block_list_insert_at_tail(cl_block_list* list, cl_block* block) {
   cl_block_list_struct* last = list->ht.prev;
   cl_block_list_insert(list, last, &block->link);
 }

 // Unlinks block `b` from `list` and decrements the element count. `b` must
 // currently be a member of `list`. The links stored in `b` itself are left
 // untouched.
 static void cl_block_list_remove(cl_block_list* list, cl_block* b) {
   cl_block_list_struct* before = b->link.prev;
   cl_block_list_struct* after = b->link.next;
   after->prev = before;
   before->next = after;
   list->count--;
 }

 // Functions that operate on cl_block's

 // Resets `block` to an empty, unlocked state backed by `buffer`, with its
 // list node detached.
 static void cl_block_initialize(cl_block* block, char* buffer) {
   block->buffer = buffer;
   block->bytes_read = 0;
   cl_block_list_struct_initialize(&block->link, block);
   gpr_atm_rel_store(&block->writer_lock, 0);
   gpr_atm_rel_store(&block->reader_lock, 0);
   gpr_atm_rel_store(&block->bytes_committed, 0);
 }

 // Publishes the number of fully written bytes in `block`. The store goes
 // through gpr_atm_rel_store() to guard against exposing a partially written
 // buffer to the reader.
 static void cl_block_set_bytes_committed(cl_block* block,
                                          size_t bytes_committed) {
   const gpr_atm encoded = (gpr_atm)bytes_committed;
   gpr_atm_rel_store(&block->bytes_committed, encoded);
 }

 // Returns the number of fully written bytes currently published for `block`.
 static size_t cl_block_get_bytes_committed(cl_block* block) {
   const gpr_atm encoded = gpr_atm_acq_load(&block->bytes_committed);
   return (size_t)encoded;
 }

 // Tries to disable future read/write access to this block. Succeeds if:
 // - no write is in progress (writer_lock can be taken) AND
 // - no read is in progress (reader_lock can be taken) AND
 // - 'discard_data' is non-zero OR the block has no unread data.
 // On success, clears the block state (bytes_committed and bytes_read become
 // 0) and returns true with writer_lock and reader_lock held; these locks are
 // released by a subsequent cl_block_enable_access() call. On failure,
 // returns false with neither lock held.
 static bool cl_block_try_disable_access(cl_block* block, int discard_data) {
   if (!cl_try_lock(&block->writer_lock)) {
     return false;
   }
   if (cl_try_lock(&block->reader_lock)) {
     if (discard_data ||
         (block->bytes_read == cl_block_get_bytes_committed(block))) {
       cl_block_set_bytes_committed(block, 0);
       block->bytes_read = 0;
       return true;
     }
     // Unread data must be preserved: back out of the reader lock.
     cl_unlock(&block->reader_lock);
   }
   cl_unlock(&block->writer_lock);
   return false;
 }

 // Re-enables access to `block` by releasing reader_lock and then
 // writer_lock, both of which cl_block_try_disable_access() leaves held.
 static void cl_block_enable_access(cl_block* block) {
   gpr_atm* const reader = &block->reader_lock;
   gpr_atm* const writer = &block->writer_lock;
   cl_unlock(reader);
   cl_unlock(writer);
 }

 // Reserves room for a record of `size` bytes in `block`.
 // Returns a pointer to the first uncommitted byte of the block's buffer with
 // writer_lock held, or NULL (writer_lock not held) if:
 // - another write is in progress, or access to the block is disabled, OR
 // - the block does not have `size` bytes of space left.
 static void* cl_block_start_write(cl_block* block, size_t size) {
   if (!cl_try_lock(&block->writer_lock)) {
     return NULL;
   }
   const size_t bytes_committed = cl_block_get_bytes_committed(block);
   const size_t capacity = (size_t)CENSUS_LOG_MAX_RECORD_SIZE;
   // Compare against the remaining space rather than computing
   // bytes_committed + size: the sum can wrap around for a very large `size`
   // and would then incorrectly pass the capacity check.
   if (bytes_committed > capacity || size > capacity - bytes_committed) {
     cl_unlock(&block->writer_lock);
     return NULL;
   }
   return block->buffer + bytes_committed;
 }

 // Commits `bytes_written` more bytes to `block` and then releases
 // writer_lock. Expects writer_lock to have been taken by the matching
 // cl_block_start_write() call; 'bytes_written' must be <= the 'size' passed
 // to that call.
 static void cl_block_end_write(cl_block* block, size_t bytes_written) {
   const size_t committed_before = cl_block_get_bytes_committed(block);
   cl_block_set_bytes_committed(block, committed_before + bytes_written);
   cl_unlock(&block->writer_lock);
 }

 // Returns a pointer to the first unread byte in the block's buffer and
 // stores the number of unread committed bytes in '*bytes_available'. All of
 // those bytes are marked as read. On success reader_lock is held; it is
 // released by a subsequent cl_block_end_read() call. Returns NULL (with
 // reader_lock not held) if:
 // - a read is in progress, or access to the block is disabled, OR
 // - no data is available ('*bytes_available' is set to 0 in this case).
 static void* cl_block_start_read(cl_block* block, size_t* bytes_available) {
   if (!cl_try_lock(&block->reader_lock)) {
     return NULL;
   }
   // bytes_committed may grow concurrently; work from this one snapshot.
   size_t bytes_committed = cl_block_get_bytes_committed(block);
   GPR_ASSERT(bytes_committed >= block->bytes_read);
   const size_t first_unread = block->bytes_read;
   const size_t unread = bytes_committed - first_unread;
   *bytes_available = unread;
   if (unread == 0) {
     cl_unlock(&block->reader_lock);
     return NULL;
   }
   block->bytes_read = first_unread + unread;
   return block->buffer + first_unread;
 }

 // Ends a read started with cl_block_start_read() by releasing reader_lock.
 static void cl_block_end_read(cl_block* block) {
   gpr_atm* const reader = &block->reader_lock;
   cl_unlock(reader);
 }

 // Internal functions operating on g_log

 // Takes a block off free_block_list, or, if that list is empty and the log
 // is configured to discard old records, recycles the oldest dirty block that
 // has no read or write in progress (its unread data is dropped). The
 // returned block is no longer on any list. Returns NULL if out-of-space.
 static cl_block* cl_allocate_block(void) {
   cl_block* free_block = cl_block_list_head(&g_log.free_block_list);
   if (free_block != NULL) {
     cl_block_list_remove(&g_log.free_block_list, free_block);
     return free_block;
   }
   if (!g_log.discard_old_records) {
     // No free block and log is configured to keep old records.
     return NULL;
   }
   // Recycle a dirty block, starting from the oldest (the list head).
   cl_block* candidate = cl_block_list_head(&g_log.dirty_block_list);
   while (candidate != NULL) {
     if (cl_block_try_disable_access(candidate, 1 /* discard data */)) {
       cl_block_list_remove(&g_log.dirty_block_list, candidate);
       return candidate;
     }
     candidate = candidate->link.next->block;
   }
   return NULL;
 }

 // Allocates a new block and updates core id => block mapping. 'old_block'
 // points to the block that the caller thinks is attached to
 // 'core_id'. 'old_block' may be NULL. Returns true if:
 // - allocated a new block OR
 // - 'core_id' => 'old_block' mapping changed (another thread allocated a
 //   block before lock was acquired).
 static bool cl_allocate_core_local_block(uint32_t core_id,
                                          cl_block* old_block) {
   // Now that we have the lock, check if core-local mapping has changed.
   cl_core_local_block* core_local_block = &g_log.core_local_blocks[core_id];
   cl_block* block = cl_core_local_block_get_block(core_local_block);
   if ((block != NULL) && (block != old_block)) {
     return true;
   }
   if (block != NULL) {
     cl_core_local_block_set_block(core_local_block, NULL);
     cl_block_list_insert_at_tail(&g_log.dirty_block_list, block);
   }
   block = cl_allocate_block();
   if (block == NULL) {
     return false;
   }
   cl_core_local_block_set_block(core_local_block, block);
   cl_block_enable_access(block);
   return true;
 }

 // Maps a record pointer inside g_log.buffer back to the metadata of the
 // block containing it: the byte offset from the start of the buffer,
 // shifted right by CENSUS_LOG_2_MAX_RECORD_SIZE, is the block index.
 static cl_block* cl_get_block(void* record) {
   const uintptr_t offset = (uintptr_t)((char*)record - g_log.buffer);
   const uintptr_t block_index = offset >> CENSUS_LOG_2_MAX_RECORD_SIZE;
   return g_log.blocks + block_index;
 }

 // Gets the next block to read. While the dirty list is being traversed it
 // also tries to free 'prev' (if not NULL). Returns NULL once the dirty list
 // and every core-local block have been visited.
 static cl_block* cl_next_block_to_read(cl_block* prev) {
   if (g_log.read_iterator_state == g_log.num_cores) {
     // We are traversing the dirty list; find the next dirty block.
     cl_block* next_dirty;
     if (prev == NULL) {
       next_dirty = cl_block_list_head(&g_log.dirty_block_list);
     } else {
       // Pick up the successor before 'prev' is possibly unlinked below.
       next_dirty = prev->link.next->block;
       // Try to free the previous block if there is no unread data. This
       // block may have unread data if a previously incomplete record
       // completed between read_next() calls.
       if (cl_block_try_disable_access(prev, 0 /* do not discard data */)) {
         cl_block_list_remove(&g_log.dirty_block_list, prev);
         cl_block_list_insert_at_head(&g_log.free_block_list, prev);
       }
     }
     if (next_dirty != NULL) {
       return next_dirty;
     }
     // We are done with the dirty list; moving on to core-local blocks.
   }
   while (g_log.read_iterator_state > 0) {
     g_log.read_iterator_state--;
     cl_block* core_block = cl_core_local_block_get_block(
         &g_log.core_local_blocks[g_log.read_iterator_state]);
     if (core_block != NULL) {
       return core_block;
     }
   }
   return NULL;
 }

 #define CL_LOG_2_MB 20  // 2^20 = 1MB

 // External functions: primary stats_log interface

 // Initializes the single global log. 'size_in_mb' is the requested log size
 // (must be < 1000); the log always gets at least two blocks per core.
 // 'discard_old_records' is stored as the log's recycling policy. All blocks
 // start out on the free list with access disabled. Must not be called while
 // the log is already initialized.
 void census_log_initialize(size_t size_in_mb, int discard_old_records) {
   // Check cacheline alignment.
   GPR_ASSERT(sizeof(cl_block) % GPR_CACHELINE_SIZE == 0);
   GPR_ASSERT(sizeof(cl_core_local_block) % GPR_CACHELINE_SIZE == 0);
   GPR_ASSERT(!g_log.initialized);
   g_log.discard_old_records = discard_old_records;
   g_log.num_cores = gpr_cpu_num_cores();
   // Ensure that we will not get any overflow in calculating num_blocks.
   GPR_ASSERT(CL_LOG_2_MB >= CENSUS_LOG_2_MAX_RECORD_SIZE);
   GPR_ASSERT(size_in_mb < 1000);
   // Ensure at least 2x as many blocks as there are cores.
   g_log.num_blocks =
       (uint32_t)GPR_MAX(2 * g_log.num_cores, (size_in_mb << CL_LOG_2_MB) >>
                                                  CENSUS_LOG_2_MAX_RECORD_SIZE);
   gpr_mu_init(&g_log.lock);
   g_log.read_iterator_state = 0;
   g_log.block_being_read = NULL;

   // Core id => block map, zeroed so that no core has a block attached.
   const size_t core_map_bytes = g_log.num_cores * sizeof(cl_core_local_block);
   g_log.core_local_blocks = (cl_core_local_block*)gpr_malloc_aligned(
       core_map_bytes, GPR_CACHELINE_SIZE_LOG);
   memset(g_log.core_local_blocks, 0, core_map_bytes);

   // Block metadata.
   const size_t block_meta_bytes = g_log.num_blocks * sizeof(cl_block);
   g_log.blocks = (cl_block*)gpr_malloc_aligned(block_meta_bytes,
                                                GPR_CACHELINE_SIZE_LOG);
   memset(g_log.blocks, 0, block_meta_bytes);

   // Record storage shared by all blocks.
   const size_t log_bytes = g_log.num_blocks * CENSUS_LOG_MAX_RECORD_SIZE;
   g_log.buffer = (char*)gpr_malloc(log_bytes);
   memset(g_log.buffer, 0, log_bytes);

   cl_block_list_initialize(&g_log.free_block_list);
   cl_block_list_initialize(&g_log.dirty_block_list);
   for (uint32_t index = 0; index < g_log.num_blocks; ++index) {
     cl_block* block = &g_log.blocks[index];
     char* block_buffer = g_log.buffer + (CENSUS_LOG_MAX_RECORD_SIZE * index);
     cl_block_initialize(block, block_buffer);
     // Blocks on the free list are kept with both of their locks held.
     cl_block_try_disable_access(block, 1 /* discard data */);
     cl_block_list_insert_at_tail(&g_log.free_block_list, block);
   }
   gpr_atm_rel_store(&g_log.out_of_space_count, 0);
   g_log.initialized = 1;
 }

 // Tears down the global log: destroys its mutex, frees the core map, the
 // block metadata and the record buffer, and marks the log as uninitialized.
 // The log must be initialized.
 void census_log_shutdown(void) {
   GPR_ASSERT(g_log.initialized);
   gpr_mu_destroy(&g_log.lock);

   gpr_free_aligned(g_log.core_local_blocks);
   gpr_free_aligned(g_log.blocks);
   gpr_free(g_log.buffer);

   // Do not leave dangling pointers behind.
   g_log.core_local_blocks = NULL;
   g_log.blocks = NULL;
   g_log.buffer = NULL;
   g_log.initialized = 0;
 }

 void* census_log_start_write(size_t size) {
   // Used to bound number of times block allocation is attempted.
   GPR_ASSERT(size > 0);
   GPR_ASSERT(g_log.initialized);
   if (size > CENSUS_LOG_MAX_RECORD_SIZE) {
     return NULL;
   }
   uint32_t attempts_remaining = g_log.num_blocks;
   uint32_t core_id = gpr_cpu_current_cpu();
   do {
     void* record = NULL;
     cl_block* block =
         cl_core_local_block_get_block(&g_log.core_local_blocks[core_id]);
     if (block && (record = cl_block_start_write(block, size))) {
       return record;
     }
     // Need to allocate a new block. We are here if:
     // - No block associated with the core OR
     // - Write in-progress on the block OR
     // - block is out of space
     gpr_mu_lock(&g_log.lock);
     bool allocated = cl_allocate_core_local_block(core_id, block);
     gpr_mu_unlock(&g_log.lock);
     if (!allocated) {
       gpr_atm_no_barrier_fetch_add(&g_log.out_of_space_count, 1);
       return NULL;
     }
   } while (attempts_remaining--);
   // Give up.
   gpr_atm_no_barrier_fetch_add(&g_log.out_of_space_count, 1);
   return NULL;
 }

 void census_log_end_write(void* record, size_t bytes_written) {
   GPR_ASSERT(g_log.initialized);
   cl_block_end_write(cl_get_block(record), bytes_written);
 }

 void census_log_init_reader(void) {
   GPR_ASSERT(g_log.initialized);
   gpr_mu_lock(&g_log.lock);
   // If a block is locked for reading unlock it.
   if (g_log.block_being_read != NULL) {
     cl_block_end_read(g_log.block_being_read);
     g_log.block_being_read = NULL;
   }
   g_log.read_iterator_state = g_log.num_cores;
   gpr_mu_unlock(&g_log.lock);
 }

 // Returns a pointer to the next chunk of unread data and stores its length
 // in '*bytes_available'. The block holding the data stays locked for reading
 // until the next census_log_read_next() or census_log_init_reader() call.
 // Returns NULL when the iteration has reached the end.
 const void* census_log_read_next(size_t* bytes_available) {
   GPR_ASSERT(g_log.initialized);
   gpr_mu_lock(&g_log.lock);
   if (g_log.block_being_read != NULL) {
     // Release the block handed out by the previous call.
     cl_block_end_read(g_log.block_being_read);
   }
   const void* result = NULL;
   for (;;) {
     g_log.block_being_read = cl_next_block_to_read(g_log.block_being_read);
     if (g_log.block_being_read == NULL) {
       break;
     }
     void* record =
         cl_block_start_read(g_log.block_being_read, bytes_available);
     if (record != NULL) {
       result = record;
       break;
     }
   }
   gpr_mu_unlock(&g_log.lock);
   return result;
 }

 // Returns the remaining log space in bytes. When old records are discarded
 // the figure is not meaningful, so the size of the whole log is returned;
 // otherwise it is the space held by the blocks on the free list.
 size_t census_log_remaining_space(void) {
   GPR_ASSERT(g_log.initialized);
   size_t space = 0;
   gpr_mu_lock(&g_log.lock);
   if (!g_log.discard_old_records) {
     GPR_ASSERT(g_log.free_block_list.count >= 0);
     space = (size_t)g_log.free_block_list.count * CENSUS_LOG_MAX_RECORD_SIZE;
   } else {
     // Remaining space is not meaningful; just return the entire log space.
     space = g_log.num_blocks << CENSUS_LOG_2_MAX_RECORD_SIZE;
   }
   gpr_mu_unlock(&g_log.lock);
   return space;
 }

 // Returns the current value of the log's out_of_space_count counter.
 int64_t census_log_out_of_space_count(void) {
   GPR_ASSERT(g_log.initialized);
   const int64_t failures = gpr_atm_acq_load(&g_log.out_of_space_count);
   return failures;
 }
	/*
	*
	* Copyright 2015 gRPC authors.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	*/

	// Implements an efficient in-memory log, optimized for multiple writers and
	// a single reader. Available log space is divided up in blocks of
	// CENSUS_LOG_MAX_RECORD_SIZE (2^CENSUS_LOG_2_MAX_RECORD_SIZE) bytes. A block
	// can be in one of the following three data structures:
	// - Free blocks (free_block_list)
	// - Blocks with unread data (dirty_block_list)
	// - Blocks currently attached to cores (core_local_blocks[])
	//
	// census_log_start_write() moves a block from core_local_blocks[] to the end of
	// dirty_block_list when block:
	// - is out-of-space OR
	// - has an incomplete record (an incomplete record occurs when a thread calls
	// census_log_start_write() and is context-switched before calling
	// census_log_end_write())
	// So, blocks in dirty_block_list are ordered, from oldest to newest, by the
	// time when block is detached from the core.
	//
	// census_log_read_next() first iterates over dirty_block_list and then
	// core_local_blocks[]. It moves completely read blocks from dirty_block_list
	// to free_block_list. Blocks in core_local_blocks[] are not freed, even when
	// completely read.
	//
	// If the log is configured to discard old records and free_block_list is empty,
	// census_log_start_write() iterates over dirty_block_list to allocate a
	// new block. It moves the oldest available block (no pending read/write) to
	// core_local_blocks[].
	//
	// cl_core_local_block is used to implement a map from core id to the block
	// associated with that core. This mapping is advisory. It is possible that the
	// block returned by this mapping is no longer associated with that core. This
	// mapping is updated, lazily, by census_log_start_write().
	//
	// Locking in block struct:
	//
	// Exclusive g_log.lock must be held before calling any functions operating on
	// block structs except census_log_start_write() and census_log_end_write().
	//
	// Writes to a block are serialized via writer_lock. census_log_start_write()
	// acquires this lock and census_log_end_write() releases it. On failure to
	// acquire the lock, writer allocates a new block for the current core and
	// updates core_local_block accordingly.
	//
	// Simultaneous read and write access is allowed. Readers can safely read up to
	// committed bytes (bytes_committed).
	//
	// reader_lock protects the block, currently being read, from getting recycled.
	// start_read() acquires reader_lock and end_read() releases the lock.
	//
	// Read/write access to a block is disabled via try_disable_access(). It returns
	// with both writer_lock and reader_lock held. These locks are subsequently
	// released by enable_access() to enable access to the block.
	//
	// A note on naming: Most function/struct names are prepended by cl_
	// (shorthand for census_log). Further, functions that manipulate structures
	// include the name of the structure, which will be passed as the first
	// argument. E.g. cl_block_initialize() will initialize a cl_block.

	#include "src/core/ext/census/mlog.h"
	#include <grpc/support/alloc.h>
	#include <grpc/support/atm.h>
	#include <grpc/support/cpu.h>
	#include <grpc/support/log.h>
	#include <grpc/support/sync.h>
	#include <grpc/support/useful.h>
	#include <stdbool.h>
	#include <string.h>

	// End of platform specific code

	// Node of an intrusive, circular doubly-linked list. `block` points at the
	// cl_block that embeds this node; it is NULL for the head/tail sentinel that
	// cl_block_list_initialize() sets up.
	typedef struct census_log_block_list_struct {
	struct census_log_block_list_struct* next;
	struct census_log_block_list_struct* prev;
	struct census_log_block* block;
	} cl_block_list_struct;

	// Metadata for one fixed-size slice of the log buffer.
	typedef struct census_log_block {
	// Pointer to underlying buffer.
	char* buffer;
	// Try-lock (0 = free, 1 = held) taken for the duration of a write and while
	// access to the block is disabled.
	gpr_atm writer_lock;
	// Try-lock (0 = free, 1 = held) taken for the duration of a read and while
	// access to the block is disabled.
	gpr_atm reader_lock;
	// Keeps completely written bytes. Declared atomic because accessed
	// simultaneously by reader and writer.
	gpr_atm bytes_committed;
	// Bytes already read.
	size_t bytes_read;
	// Links for list.
	cl_block_list_struct link;
	// We want this structure to be cacheline aligned. We assume the following
	// sizes for the various parts on 32/64bit systems:
	// type                 32b size    64b size
	// char*                   4           8
	// 3x gpr_atm             12          24
	// size_t                  4           8
	// cl_block_list_struct   12          24
	// TOTAL                  32          64
	//
	// Depending on the size of our cacheline and the architecture, we
	// selectively add char padding to this structure. The resulting size is
	// checked via assert in census_log_initialize().
	#if defined(GPR_ARCH_64)
	#define CL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 64)
	#else
	#if defined(GPR_ARCH_32)
	#define CL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 32)
	#else
	#error "Unknown architecture"
	#endif
	#endif
	#if CL_BLOCK_PAD_SIZE > 0
	char padding[CL_BLOCK_PAD_SIZE];
	#endif
	} cl_block;

	// A list of cl_blocks, doubly-linked through cl_block::link. The list is
	// circular: `ht` is a sentinel node whose `next` is the first element and
	// whose `prev` is the last one.
	typedef struct census_log_block_list {
	int32_t count; // Number of items in list.
	cl_block_list_struct ht; // Sentinel: head/tail of linked list.
	} cl_block_list;

	// Cacheline aligned block pointers to avoid false sharing. Block pointer must
	// be initialized via cl_core_local_block_set_block() before calling other
	// functions.
	typedef struct census_log_core_local_block {
	// A cl_block* stored as a gpr_atm so that it can be read and written
	// atomically.
	gpr_atm block;
	// Ensure cacheline alignment: we assume sizeof(gpr_atm) == 4 or 8
	#if defined(GPR_ARCH_64)
	#define CL_CORE_LOCAL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 8)
	#else
	#if defined(GPR_ARCH_32)
	#define CL_CORE_LOCAL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 4)
	#else
	#error "Unknown architecture"
	#endif
	#endif
	#if CL_CORE_LOCAL_BLOCK_PAD_SIZE > 0
	char padding[CL_CORE_LOCAL_BLOCK_PAD_SIZE];
	#endif
	} cl_core_local_block;

	// Complete state of the in-memory log.
	struct census_log {
	// Non-zero if dirty blocks may be recycled (dropping unread data) once no
	// free block is left.
	int discard_old_records;
	// Number of cores (aka hardware-contexts)
	unsigned num_cores;
	// Number of blocks in the log; each block is CENSUS_LOG_MAX_RECORD_SIZE
	// bytes.
	uint32_t num_blocks;
	cl_block* blocks; // Block metadata.
	cl_core_local_block* core_local_blocks; // Keeps core to block mappings.
	gpr_mu lock;
	int initialized; // has log been initialized?
	// Keeps the state of the reader iterator. A value of 0 indicates that the
	// iterator has reached the end. census_log_init_reader() resets the value
	// to num_cores to restart iteration.
	uint32_t read_iterator_state;
	// Points to the block being read. If non-NULL, the block is locked for
	// reading (block_being_read->reader_lock is held).
	cl_block* block_being_read;
	// Backing storage for the records of all blocks.
	char* buffer;
	cl_block_list free_block_list;
	cl_block_list dirty_block_list;
	gpr_atm out_of_space_count;
	};

	// Single internal log.
	static struct census_log g_log;

	// Functions that operate on an atomic memory location used as a lock.

	// Tries to take the lock stored in *lock by atomically swapping it from 0
	// (unlocked) to 1 (locked). Never blocks. Returns non-zero if the lock was
	// acquired.
	static int cl_try_lock(gpr_atm* lock) {
	  const int acquired = gpr_atm_acq_cas(lock, 0, 1);
	  return acquired;
	}

	// Releases a lock taken with cl_try_lock() by storing 0 into *lock.
	static void cl_unlock(gpr_atm* lock) {
	  gpr_atm_rel_store(lock, 0);
	}

	// Functions that operate on cl_core_local_block's.

	// Stores `block` (which may be NULL) as the block attached to the core-local
	// slot `clb`.
	static void cl_core_local_block_set_block(cl_core_local_block* clb,
	                                          cl_block* block) {
	  const gpr_atm encoded = (gpr_atm)block;
	  gpr_atm_rel_store(&clb->block, encoded);
	}

	// Loads the block currently attached to the core-local slot `clb`; the
	// result is NULL when no block is attached.
	static cl_block* cl_core_local_block_get_block(cl_core_local_block* clb) {
	  const gpr_atm encoded = gpr_atm_acq_load(&clb->block);
	  return (cl_block*)encoded;
	}

	// Functions that operate on cl_block_list_struct's.

	// Turns `bls` into a detached node: it links to itself in both directions
	// and records `block` as its owner.
	static void cl_block_list_struct_initialize(cl_block_list_struct* bls,
	                                            cl_block* block) {
	  bls->block = block;
	  bls->prev = bls;
	  bls->next = bls;
	}

	// Functions that operate on cl_block_list's.

	// Makes `list` empty: zero count and a self-linked sentinel that owns no
	// block.
	static void cl_block_list_initialize(cl_block_list* list) {
	  cl_block_list_struct* sentinel = &list->ht;
	  list->count = 0;
	  sentinel->next = sentinel->prev = sentinel;
	  sentinel->block = NULL;
	}

	// Returns the first block of `list`, or NULL if the list is empty (an empty
	// list's sentinel links to itself and its `block` is NULL).
	static cl_block* cl_block_list_head(cl_block_list* list) {
	  cl_block_list_struct* first = list->ht.next;
	  return first->block;
	}

	// Links node `e` into `list` immediately after node `pos` and bumps the
	// element count.
	static void cl_block_list_insert(cl_block_list* list, cl_block_list_struct* pos,
	                                 cl_block_list_struct* e) {
	  cl_block_list_struct* successor = pos->next;
	  e->next = successor;
	  e->prev = pos;
	  successor->prev = e;
	  pos->next = e;
	  list->count++;
	}

	// Makes `block` the first element of `list` by inserting it right after the
	// sentinel.
	static void cl_block_list_insert_at_head(cl_block_list* list, cl_block* block) {
	  cl_block_list_struct* sentinel = &list->ht;
	  cl_block_list_insert(list, sentinel, &block->link);
	}

	// Makes `block` the last element of `list` by inserting it after the current
	// last node (the sentinel's `prev`).
	static void cl_block_list_insert_at_tail(cl_block_list* list, cl_block* block) {
	  cl_block_list_struct* last = list->ht.prev;
	  cl_block_list_insert(list, last, &block->link);
	}

	// Unlinks block `b` from `list` and decrements the element count. `b` must
	// currently be a member of `list`. The links stored in `b` itself are left
	// untouched.
	static void cl_block_list_remove(cl_block_list* list, cl_block* b) {
	  cl_block_list_struct* before = b->link.prev;
	  cl_block_list_struct* after = b->link.next;
	  after->prev = before;
	  before->next = after;
	  list->count--;
	}

	// Functions that operate on cl_block's

	// Resets `block` to an empty, unlocked state backed by `buffer`, with its
	// list node detached.
	static void cl_block_initialize(cl_block* block, char* buffer) {
	  block->buffer = buffer;
	  block->bytes_read = 0;
	  cl_block_list_struct_initialize(&block->link, block);
	  gpr_atm_rel_store(&block->writer_lock, 0);
	  gpr_atm_rel_store(&block->reader_lock, 0);
	  gpr_atm_rel_store(&block->bytes_committed, 0);
	}

	// Publishes the number of fully written bytes in `block`. The store goes
	// through gpr_atm_rel_store() to guard against exposing a partially written
	// buffer to the reader.
	static void cl_block_set_bytes_committed(cl_block* block,
	                                         size_t bytes_committed) {
	  const gpr_atm encoded = (gpr_atm)bytes_committed;
	  gpr_atm_rel_store(&block->bytes_committed, encoded);
	}

	// Returns the number of fully written bytes currently published for `block`.
	static size_t cl_block_get_bytes_committed(cl_block* block) {
	  const gpr_atm encoded = gpr_atm_acq_load(&block->bytes_committed);
	  return (size_t)encoded;
	}

	// Tries to disable future read/write access to this block. Succeeds if:
	// - no write is in progress (writer_lock can be taken) AND
	// - no read is in progress (reader_lock can be taken) AND
	// - 'discard_data' is non-zero OR the block has no unread data.
	// On success, clears the block state (bytes_committed and bytes_read become
	// 0) and returns true with writer_lock and reader_lock held; these locks are
	// released by a subsequent cl_block_enable_access() call. On failure,
	// returns false with neither lock held.
	static bool cl_block_try_disable_access(cl_block* block, int discard_data) {
	  if (!cl_try_lock(&block->writer_lock)) {
	    return false;
	  }
	  if (cl_try_lock(&block->reader_lock)) {
	    if (discard_data ||
	        (block->bytes_read == cl_block_get_bytes_committed(block))) {
	      cl_block_set_bytes_committed(block, 0);
	      block->bytes_read = 0;
	      return true;
	    }
	    // Unread data must be preserved: back out of the reader lock.
	    cl_unlock(&block->reader_lock);
	  }
	  cl_unlock(&block->writer_lock);
	  return false;
	}

	// Releases the reader and writer locks of 'block' (reader first), undoing a
	// successful cl_block_try_disable_access().
	static void cl_block_enable_access(cl_block* block) {
	  cl_unlock(&block->reader_lock);
	  cl_unlock(&block->writer_lock);
	}

	// Reserves 'size' bytes at the current end of the committed data in 'block'.
	// Returns a pointer to the reserved space with writer_lock held, or NULL
	// (with writer_lock not held) when:
	// - another write is in progress on the block OR
	// - the block cannot hold 'size' more bytes.
	static void* cl_block_start_write(cl_block* block, size_t size) {
	  if (!cl_try_lock(&block->writer_lock)) {
	    return NULL;
	  }
	  const size_t write_offset = cl_block_get_bytes_committed(block);
	  if (write_offset + size <= CENSUS_LOG_MAX_RECORD_SIZE) {
	    return block->buffer + write_offset;
	  }
	  // Not enough room left in this block.
	  cl_unlock(&block->writer_lock);
	  return NULL;
	}

	// Completes a write begun by cl_block_start_write(): advances the committed
	// byte count of 'block' by 'bytes_written' and then releases writer_lock.
	// 'bytes_written' must be <= the 'size' passed to the matching
	// cl_block_start_write() call.
	static void cl_block_end_write(cl_block* block, size_t bytes_written) {
	  const size_t new_committed =
	      cl_block_get_bytes_committed(block) + bytes_written;
	  cl_block_set_bytes_committed(block, new_committed);
	  cl_unlock(&block->writer_lock);
	}

	// Starts reading the unread portion of 'block'. On success returns a pointer
	// to the first unread byte, stores the number of readable bytes in
	// '*bytes_available', marks those bytes as read and keeps reader_lock held
	// until cl_block_end_read() is called. Returns NULL (reader_lock not held)
	// when:
	// - a read is already in progress OR
	// - there is no unread data (in which case '*bytes_available' is set to 0).
	static void* cl_block_start_read(cl_block* block, size_t* bytes_available) {
	  if (!cl_try_lock(&block->reader_lock)) {
	    return NULL;
	  }
	  // Take a single snapshot of bytes_committed; writers may advance it while
	  // we are here, so everything below is derived from this snapshot.
	  const size_t committed = cl_block_get_bytes_committed(block);
	  GPR_ASSERT(committed >= block->bytes_read);
	  const size_t unread = committed - block->bytes_read;
	  *bytes_available = unread;
	  if (unread == 0) {
	    cl_unlock(&block->reader_lock);
	    return NULL;
	  }
	  void* first_unread = block->buffer + block->bytes_read;
	  block->bytes_read += unread;
	  return first_unread;
	}

	// Finishes a read begun by cl_block_start_read() by releasing reader_lock.
	static void cl_block_end_read(cl_block* block) {
	  cl_unlock(&block->reader_lock);
	}

	// Internal functions operating on g_log

	// Hands out a block for writing. Preference order:
	// 1. the head of the free list;
	// 2. if the log is configured to discard old records, the first block in the
	//    dirty list (walked from its head) whose access can be disabled; its
	//    unread data is discarded.
	// The chosen block is removed from its list. Returns NULL if no block can be
	// provided.
	static cl_block* cl_allocate_block(void) {
	  cl_block* candidate = cl_block_list_head(&g_log.free_block_list);
	  if (candidate != NULL) {
	    cl_block_list_remove(&g_log.free_block_list, candidate);
	    return candidate;
	  }
	  if (!g_log.discard_old_records) {
	    // Nothing free, and old records must be kept.
	    return NULL;
	  }
	  // Try to recycle a dirty block, starting at the head of the dirty list.
	  candidate = cl_block_list_head(&g_log.dirty_block_list);
	  while (candidate != NULL) {
	    if (cl_block_try_disable_access(candidate, 1 /* discard data */)) {
	      cl_block_list_remove(&g_log.dirty_block_list, candidate);
	      return candidate;
	    }
	    candidate = candidate->link.next->block;
	  }
	  return NULL;
	}

	// Allocates a new block and updates core id => block mapping. 'old_block'
	// points to the block that the caller thinks is attached to
	// 'core_id'. 'old_block' may be NULL. Returns true if:
	// - allocated a new block OR
	// - 'core_id' => 'old_block' mapping changed (another thread allocated a
	// block before lock was acquired).
	static bool cl_allocate_core_local_block(uint32_t core_id,
	cl_block* old_block) {
	// Now that we have the lock, check if core-local mapping has changed.
	cl_core_local_block* core_local_block = &g_log.core_local_blocks[core_id];
	cl_block* block = cl_core_local_block_get_block(core_local_block);
	if ((block != NULL) && (block != old_block)) {
	return true;
	}
	if (block != NULL) {
	cl_core_local_block_set_block(core_local_block, NULL);
	cl_block_list_insert_at_tail(&g_log.dirty_block_list, block);
	}
	block = cl_allocate_block();
	if (block == NULL) {
	return false;
	}
	cl_core_local_block_set_block(core_local_block, block);
	cl_block_enable_access(block);
	return true;
	}

	// Maps a record pointer back to the block that owns it: the byte offset of
	// 'record' from the start of g_log.buffer, shifted right by
	// CENSUS_LOG_2_MAX_RECORD_SIZE, is the index into g_log.blocks. No bounds
	// checking is performed.
	static cl_block* cl_get_block(void* record) {
	  const uintptr_t byte_offset = (uintptr_t)((char*)record - g_log.buffer);
	  const uintptr_t block_index = byte_offset >> CENSUS_LOG_2_MAX_RECORD_SIZE;
	  return g_log.blocks + block_index;
	}

	// Advances the read iterator and returns the next block to read, or NULL
	// when the iteration is finished. 'prev' is the block returned by the
	// previous step (NULL at the start).
	//
	// While g_log.read_iterator_state equals g_log.num_cores the dirty list is
	// being walked; afterwards read_iterator_state counts down through
	// g_log.core_local_blocks[].
	static cl_block* cl_next_block_to_read(cl_block* prev) {
	  if (g_log.read_iterator_state == g_log.num_cores) {
	    // Dirty-list phase.
	    cl_block* next_dirty = NULL;
	    if (prev == NULL) {
	      next_dirty = cl_block_list_head(&g_log.dirty_block_list);
	    } else {
	      // Remember the successor before 'prev' is possibly unlinked.
	      next_dirty = prev->link.next->block;
	      // Move 'prev' to the free list if it has no unread data. It may still
	      // have unread data if a previously incomplete record was completed
	      // between read_next() calls.
	      if (cl_block_try_disable_access(prev, 0 /* do not discard data */)) {
	        cl_block_list_remove(&g_log.dirty_block_list, prev);
	        cl_block_list_insert_at_head(&g_log.free_block_list, prev);
	      }
	    }
	    if (next_dirty != NULL) {
	      return next_dirty;
	    }
	    // Dirty list exhausted; continue with the core-local blocks below.
	  }
	  // Core-local phase: scan downwards, skipping cores without a block.
	  while (g_log.read_iterator_state > 0) {
	    g_log.read_iterator_state--;
	    cl_block* core_block = cl_core_local_block_get_block(
	        &g_log.core_local_blocks[g_log.read_iterator_state]);
	    if (core_block != NULL) {
	      return core_block;
	    }
	  }
	  return NULL;
	}

	// Shift amount used by census_log_initialize() to convert a size expressed
	// in megabytes into bytes.
	#define CL_LOG_2_MB 20 // 2^20 = 1MB

	// External functions: primary stats_log interface
	// Initializes the global log.
	// - 'size_in_mb': requested log capacity in megabytes; must be < 1000. The
	//   log always gets at least two blocks per core, even if that exceeds the
	//   requested size.
	// - 'discard_old_records': when non-zero, blocks holding unread data may be
	//   recycled once no free block is left.
	// Must not be called while the log is already initialized.
	void census_log_initialize(size_t size_in_mb, int discard_old_records) {
	  // Check cacheline alignment.
	  GPR_ASSERT(sizeof(cl_block) % GPR_CACHELINE_SIZE == 0);
	  GPR_ASSERT(sizeof(cl_core_local_block) % GPR_CACHELINE_SIZE == 0);
	  GPR_ASSERT(!g_log.initialized);
	  g_log.discard_old_records = discard_old_records;
	  g_log.num_cores = gpr_cpu_num_cores();
	  // Ensure that we will not get any overflow in calculating num_blocks.
	  GPR_ASSERT(CL_LOG_2_MB >= CENSUS_LOG_2_MAX_RECORD_SIZE);
	  GPR_ASSERT(size_in_mb < 1000);
	  // Ensure at least 2x as many blocks as there are cores.
	  g_log.num_blocks =
	      (uint32_t)GPR_MAX(2 * g_log.num_cores, (size_in_mb << CL_LOG_2_MB) >>
	                                                 CENSUS_LOG_2_MAX_RECORD_SIZE);
	  gpr_mu_init(&g_log.lock);
	  g_log.read_iterator_state = 0;
	  g_log.block_being_read = NULL;
	  // Compute each region's byte size once, in size_t, so the allocation and
	  // the matching memset can never disagree.
	  const size_t core_local_bytes = g_log.num_cores * sizeof(cl_core_local_block);
	  const size_t blocks_bytes = g_log.num_blocks * sizeof(cl_block);
	  const size_t buffer_bytes =
	      (size_t)g_log.num_blocks * CENSUS_LOG_MAX_RECORD_SIZE;
	  g_log.core_local_blocks = (cl_core_local_block*)gpr_malloc_aligned(
	      core_local_bytes, GPR_CACHELINE_SIZE_LOG);
	  memset(g_log.core_local_blocks, 0, core_local_bytes);
	  g_log.blocks =
	      (cl_block*)gpr_malloc_aligned(blocks_bytes, GPR_CACHELINE_SIZE_LOG);
	  memset(g_log.blocks, 0, blocks_bytes);
	  g_log.buffer = (char*)gpr_malloc(buffer_bytes);
	  memset(g_log.buffer, 0, buffer_bytes);
	  cl_block_list_initialize(&g_log.free_block_list);
	  cl_block_list_initialize(&g_log.dirty_block_list);
	  for (uint32_t i = 0; i < g_log.num_blocks; ++i) {
	    cl_block* block = g_log.blocks + i;
	    // Block i owns the i-th CENSUS_LOG_MAX_RECORD_SIZE-sized slice of the
	    // buffer.
	    cl_block_initialize(block,
	                        g_log.buffer + ((size_t)CENSUS_LOG_MAX_RECORD_SIZE * i));
	    // Blocks sit on the free list with both locks held; the locks are
	    // released by cl_block_enable_access() when the block is attached to a
	    // core.
	    cl_block_try_disable_access(block, 1 /* discard data */);
	    cl_block_list_insert_at_tail(&g_log.free_block_list, block);
	  }
	  gpr_atm_rel_store(&g_log.out_of_space_count, 0);
	  g_log.initialized = 1;
	}

	// Tears down the global log: destroys its mutex, frees the three memory
	// regions allocated by census_log_initialize() and marks the log as
	// uninitialized. The log must currently be initialized.
	void census_log_shutdown(void) {
	  GPR_ASSERT(g_log.initialized);
	  gpr_mu_destroy(&g_log.lock);
	  // Release storage in reverse order of allocation, clearing each pointer.
	  gpr_free(g_log.buffer);
	  g_log.buffer = NULL;
	  gpr_free_aligned(g_log.blocks);
	  g_log.blocks = NULL;
	  gpr_free_aligned(g_log.core_local_blocks);
	  g_log.core_local_blocks = NULL;
	  g_log.initialized = 0;
	}

	// Reserves space for a record of 'size' bytes (size must be > 0) in the
	// block attached to the calling thread's current core. Returns a pointer to
	// the reserved space, to be completed with census_log_end_write(), or NULL
	// when:
	// - 'size' exceeds CENSUS_LOG_MAX_RECORD_SIZE OR
	// - no block could be allocated (out_of_space_count is incremented) OR
	// - the bounded number of attempts is exhausted (out_of_space_count is
	//   incremented).
	void* census_log_start_write(size_t size) {
	  GPR_ASSERT(size > 0);
	  GPR_ASSERT(g_log.initialized);
	  if (size > CENSUS_LOG_MAX_RECORD_SIZE) {
	    return NULL;
	  }
	  // Used to bound number of times block allocation is attempted.
	  uint32_t attempts_remaining = g_log.num_blocks;
	  const uint32_t core_id = gpr_cpu_current_cpu();
	  for (;;) {
	    cl_block* block =
	        cl_core_local_block_get_block(&g_log.core_local_blocks[core_id]);
	    if (block != NULL) {
	      void* record = cl_block_start_write(block, size);
	      if (record != NULL) {
	        return record;
	      }
	    }
	    // Need to allocate a new block. We are here if:
	    // - No block associated with the core OR
	    // - Write in-progress on the block OR
	    // - block is out of space
	    gpr_mu_lock(&g_log.lock);
	    const bool allocated = cl_allocate_core_local_block(core_id, block);
	    gpr_mu_unlock(&g_log.lock);
	    if (!allocated) {
	      gpr_atm_no_barrier_fetch_add(&g_log.out_of_space_count, 1);
	      return NULL;
	    }
	    if (attempts_remaining-- == 0) {
	      break;
	    }
	  }
	  // Give up.
	  gpr_atm_no_barrier_fetch_add(&g_log.out_of_space_count, 1);
	  return NULL;
	}

	// Commits 'bytes_written' bytes of the record previously handed out by
	// census_log_start_write(). The owning block is derived from the 'record'
	// address.
	void census_log_end_write(void* record, size_t bytes_written) {
	  GPR_ASSERT(g_log.initialized);
	  cl_block_end_write(/* block */ cl_get_block(record),
	                     /* bytes_written */ bytes_written);
	}

	// Resets the read iterator so that the next census_log_read_next() call
	// starts from the beginning (the dirty list). If a block is still locked for
	// reading from an earlier iteration, its reader lock is released.
	void census_log_init_reader(void) {
	  GPR_ASSERT(g_log.initialized);
	  gpr_mu_lock(&g_log.lock);
	  // read_iterator_state == num_cores means "walking the dirty list".
	  g_log.read_iterator_state = g_log.num_cores;
	  if (g_log.block_being_read != NULL) {
	    cl_block_end_read(g_log.block_being_read);
	    g_log.block_being_read = NULL;
	  }
	  gpr_mu_unlock(&g_log.lock);
	}

	// Returns a pointer to the next chunk of unread log data and stores its
	// length in '*bytes_available', or returns NULL when no further block
	// yields data. The block backing the returned data stays locked for reading
	// until the next census_log_read_next() or census_log_init_reader() call.
	const void* census_log_read_next(size_t* bytes_available) {
	  GPR_ASSERT(g_log.initialized);
	  gpr_mu_lock(&g_log.lock);
	  // Release the block handed out by the previous call, if any.
	  if (g_log.block_being_read != NULL) {
	    cl_block_end_read(g_log.block_being_read);
	  }
	  void* record = NULL;
	  for (;;) {
	    g_log.block_being_read = cl_next_block_to_read(g_log.block_being_read);
	    if (g_log.block_being_read == NULL) {
	      // Reached the end of the iteration.
	      break;
	    }
	    record = cl_block_start_read(g_log.block_being_read, bytes_available);
	    if (record != NULL) {
	      break;
	    }
	    // Nothing readable in this block; move on to the next one.
	  }
	  gpr_mu_unlock(&g_log.lock);
	  return record;
	}

	// Reports the log space, in bytes, still available for writing. When the
	// log discards old records the figure is not meaningful, so the size of the
	// entire log is returned; otherwise it is the number of free blocks times
	// CENSUS_LOG_MAX_RECORD_SIZE.
	size_t census_log_remaining_space(void) {
	  GPR_ASSERT(g_log.initialized);
	  size_t remaining = 0;
	  gpr_mu_lock(&g_log.lock);
	  if (!g_log.discard_old_records) {
	    GPR_ASSERT(g_log.free_block_list.count >= 0);
	    remaining =
	        (size_t)g_log.free_block_list.count * CENSUS_LOG_MAX_RECORD_SIZE;
	  } else {
	    remaining = g_log.num_blocks << CENSUS_LOG_2_MAX_RECORD_SIZE;
	  }
	  gpr_mu_unlock(&g_log.lock);
	  return remaining;
	}

	// Returns the current value of the out-of-space counter, which
	// census_log_start_write() increments each time it fails to obtain a block.
	int64_t census_log_out_of_space_count(void) {
	  GPR_ASSERT(g_log.initialized);
	  const int64_t failures = gpr_atm_acq_load(&g_log.out_of_space_count);
	  return failures;
	}