include/jemalloc/internal/prof.h - platform/external/jemalloc - Gitiles

 /******************************************************************************/
 #ifdef JEMALLOC_H_TYPES

 typedef struct prof_bt_s prof_bt_t;
 typedef struct prof_cnt_s prof_cnt_t;
 typedef struct prof_thr_cnt_s prof_thr_cnt_t;
 typedef struct prof_ctx_s prof_ctx_t;
 typedef struct prof_tdata_s prof_tdata_t;

 /* Option defaults. */
 #define	PROF_PREFIX_DEFAULT		"jeprof"
 #define	LG_PROF_SAMPLE_DEFAULT		19
 #define	LG_PROF_INTERVAL_DEFAULT	-1

 /*
  * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
  * is based on __builtin_return_address() necessarily has a hard-coded number
  * of backtrace frame handlers, and should be kept in sync with this setting.
  */
 #define	PROF_BT_MAX			128

 /* Maximum number of backtraces to store in each per thread LRU cache. */
 #define	PROF_TCMAX			1024

 /* Initial hash table size. */
 #define	PROF_CKH_MINITEMS		64

 /* Size of memory buffer to use when writing dump files. */
 #define	PROF_DUMP_BUFSIZE		65536

 /* Size of stack-allocated buffer used by prof_printf(). */
 #define	PROF_PRINTF_BUFSIZE		128

 /*
  * Number of mutexes shared among all ctx's.  No space is allocated for these
  * unless profiling is enabled, so it's okay to over-provision.
  */
 #define	PROF_NCTX_LOCKS			1024

 /*
  * prof_tdata pointers close to NULL are used to encode state information that
  * is used for cleaning up during thread shutdown.
  */
 #define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
 #define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
 #define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY

 #endif /* JEMALLOC_H_TYPES */
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS

 /*
  * A captured stack backtrace.  bt_init() and prof_backtrace(), declared in
  * the externs section of this header, both take a pointer to this type.
  */
 struct prof_bt_s {
 	/* Backtrace, stored as len program counters. */
 	void		**vec;
 	/* Number of program counters stored in vec. */
 	unsigned	len;
 };

 #ifdef JEMALLOC_PROF_LIBGCC
 /*
  * Data structure passed to libgcc _Unwind_Backtrace() callback functions.
  * Only declared when JEMALLOC_PROF_LIBGCC is defined.
  */
 typedef struct {
 	/* NOTE(review): presumably the backtrace being filled in -- confirm. */
 	prof_bt_t	*bt;
 	/* NOTE(review): presumably frames still to be skipped -- confirm. */
 	unsigned	nignore;
 	/* NOTE(review): presumably the cap on recorded frames -- confirm. */
 	unsigned	max;
 } prof_unwind_data_t;
 #endif

 /*
  * One set of profiling counters.  Embedded in prof_thr_cnt_t (cnts) and in
  * prof_ctx_t (cnt_summed, cnt_merged).
  */
 struct prof_cnt_s {
 	/*
 	 * Profiling counters.  An allocation/deallocation pair can operate on
 	 * different prof_thr_cnt_t objects that are linked into the same
 	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
 	 * negative.  In principle it is possible for the *bytes counters to
 	 * overflow/underflow, but a general solution would require something
 	 * like 128-bit counters; this implementation doesn't bother to solve
 	 * that problem.
 	 */
 	/* Incremented by prof_malloc(), decremented by prof_free(). */
 	int64_t		curobjs;
 	int64_t		curbytes;
 	/* Only advanced when opt_prof_accum is true; never decremented here. */
 	uint64_t	accumobjs;
 	uint64_t	accumbytes;
 };

 /*
  * Counters owned by one thread for one prof_ctx_t.  The object is linked
  * into two lists at once: the ctx's cnts_ql and the owning thread's LRU.
  */
 struct prof_thr_cnt_s {
 	/* Linkage into prof_ctx_t's cnts_ql. */
 	ql_elm(prof_thr_cnt_t)	cnts_link;

 	/* Linkage into thread's LRU. */
 	ql_elm(prof_thr_cnt_t)	lru_link;

 	/*
 	 * Associated context.  If a thread frees an object that it did not
 	 * allocate, it is possible that the context is not cached in the
 	 * thread's hash table, in which case it must be able to look up the
 	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
 	 * and link it into the prof_ctx_t's cnts_ql.
 	 */
 	prof_ctx_t		*ctx;

 	/*
 	 * Threads use memory barriers to update the counters.  Since there is
 	 * only ever one writer, the only challenge is for the reader to get a
 	 * consistent read of the counters.
 	 *
 	 * The writer uses this series of operations:
 	 *
 	 * 1) Increment epoch to an odd number.
 	 * 2) Update counters.
 	 * 3) Increment epoch to an even number.
 	 *
 	 * The reader must assure 1) that the epoch is even while it reads the
 	 * counters, and 2) that the epoch doesn't change between the time it
 	 * starts and finishes reading the counters.
 	 */
 	unsigned		epoch;

 	/* Profiling counters, guarded by the epoch protocol described above. */
 	prof_cnt_t		cnts;
 };

 /*
  * Profiling context for one backtrace.  It aggregates the per-thread
  * counter objects linked into cnts_ql plus the totals left behind by
  * threads that have exited (cnt_merged).
  */
 struct prof_ctx_s {
 	/* Associated backtrace. */
 	prof_bt_t		*bt;

 	/* Protects nlimbo, cnt_merged, and cnts_ql. */
 	malloc_mutex_t		*lock;

 	/*
 	 * Number of threads that currently cause this ctx to be in a state of
 	 * limbo due to one of:
 	 *   - Initializing per thread counters associated with this ctx.
 	 *   - Preparing to destroy this ctx.
 	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
 	 * ctx.
 	 */
 	unsigned		nlimbo;

 	/* Temporary storage for summation during dump. */
 	prof_cnt_t		cnt_summed;

 	/*
 	 * When threads exit, they merge their stats into cnt_merged.
 	 * prof_realloc() and prof_free() also decrement it directly, under
 	 * lock, when prof_lookup() returns NULL.
 	 */
 	prof_cnt_t		cnt_merged;

 	/*
 	 * List of profile counters, one for each thread that has allocated in
 	 * this context.
 	 */
 	ql_head(prof_thr_cnt_t)	cnts_ql;
 };

 /*
  * Per-thread profiling state, obtained through prof_tdata_get().  Pointer
  * values up to PROF_TDATA_STATE_MAX are state sentinels rather than real
  * objects and must not be dereferenced.
  */
 struct prof_tdata_s {
 	/*
 	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
 	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
 	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
 	 * others will ever write them.
 	 *
 	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
 	 * counter data into the associated prof_ctx_t objects, and unlink/free
 	 * the prof_thr_cnt_t objects.
 	 */
 	ckh_t			bt2cnt;

 	/* LRU for contents of bt2cnt. */
 	ql_head(prof_thr_cnt_t)	lru_ql;

 	/* Backtrace vector, used for calls to prof_backtrace(). */
 	void			**vec;

 	/*
 	 * Sampling state.
 	 *   prng_state: generator state consumed by
 	 *               prof_sample_threshold_update().
 	 *   threshold:  byte count at which the next sample is taken; 0 means
 	 *               "not yet initialized" (see PROF_ALLOC_PREP()).
 	 *   accum:      bytes accumulated toward threshold so far.
 	 */
 	uint64_t		prng_state;
 	uint64_t		threshold;
 	uint64_t		accum;

 	/* State used to avoid dumping while operating on prof internals. */
 	bool			enq;
 	bool			enq_idump;
 	bool			enq_gdump;
 };

 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS

 extern bool	opt_prof;
 /*
  * Even if opt_prof is true, sampling can be temporarily disabled by setting
  * opt_prof_active to false.  No locking is used when updating opt_prof_active,
  * so there are no guarantees regarding how long it will take for all threads
  * to notice state changes.
  */
 extern bool	opt_prof_active;
 extern size_t	opt_lg_prof_sample;   /* Mean bytes between samples. */
 extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
 extern bool	opt_prof_gdump;       /* High-water memory dumping. */
 extern bool	opt_prof_final;       /* Final profile dumping. */
 extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
 extern bool	opt_prof_accum;       /* Report cumulative bytes. */
 extern char	opt_prof_prefix[PATH_MAX + 1];

 /*
  * Profile dump interval, measured in bytes allocated.  Each arena triggers a
  * profile dump when it reaches this threshold.  The effect is that the
  * interval between profile dumps averages prof_interval, though the actual
  * interval between dumps will tend to be sporadic, and the interval will be a
  * maximum of approximately (prof_interval * narenas).
  */
 extern uint64_t	prof_interval;

 /*
  * If true, promote small sampled objects to large objects, since small run
  * headers do not have embedded profile context pointers.
  */
 extern bool	prof_promote;

 void	bt_init(prof_bt_t *bt, void **vec);
 void	prof_backtrace(prof_bt_t *bt, unsigned nignore);
 prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
 void	prof_idump(void);
 bool	prof_mdump(const char *filename);
 void	prof_gdump(void);
 prof_tdata_t	*prof_tdata_init(void);
 void	prof_tdata_cleanup(void *arg);
 void	prof_boot0(void);
 void	prof_boot1(void);
 bool	prof_boot2(void);
 void	prof_prefork(void);
 void	prof_postfork_parent(void);
 void	prof_postfork_child(void);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

 /*
  * Decide, before an allocation is attempted, whether it will be sampled,
  * and store the decision in ret.
  *
  *   nignore: forwarded unchanged to prof_backtrace().
  *   size:    allocation size; asserted to equal s2u(size).  It is
  *            evaluated several times and its address is taken (&size), so
  *            the argument must be a side-effect-free lvalue.
  *   ret:     lvalue of type prof_thr_cnt_t * that receives one of:
  *              NULL                 prof_tdata_get(true) returned NULL;
  *              (prof_thr_cnt_t *)1  do not sample (thread data is a
  *                                   non-NULL state sentinel, sampling is
  *                                   inactive, or size does not reach the
  *                                   sample threshold);
  *              prof_lookup(&bt)     sample, using a fresh backtrace (this
  *                                   value is whatever prof_lookup()
  *                                   returns, which callers elsewhere in
  *                                   this header compare against NULL).
  *
  * The `break` inside the body leaves the enclosing do { } while (0).
  * Neither threshold nor accum is advanced here; that happens in
  * prof_sample_accum_update() once the allocation is known to have
  * succeeded.
  */
 #define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
 	prof_tdata_t *prof_tdata;					\
 	prof_bt_t bt;							\
 									\
 	assert(size == s2u(size));					\
 									\
 	prof_tdata = prof_tdata_get(true);				\
 	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) {	\
 		if (prof_tdata != NULL)					\
 			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
 		else							\
 			ret = NULL;					\
 		break;							\
 	}								\
 									\
 	if (opt_prof_active == false) {					\
 		/* Sampling is currently inactive, so avoid sampling. */\
 		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
 	} else if (opt_lg_prof_sample == 0) {				\
 		/* Don't bother with sampling logic, since sampling   */\
 		/* interval is 1.                                     */\
 		bt_init(&bt, prof_tdata->vec);				\
 		prof_backtrace(&bt, nignore);				\
 		ret = prof_lookup(&bt);					\
 	} else {							\
 		if (prof_tdata->threshold == 0) {			\
 			/* Initialize.  Seed the prng differently for */\
 			/* each thread.                               */\
 			prof_tdata->prng_state =			\
 			    (uint64_t)(uintptr_t)&size;			\
 			prof_sample_threshold_update(prof_tdata);	\
 		}							\
 									\
 		/* Determine whether to capture a backtrace based on  */\
 		/* whether size is enough for prof_accum to reach     */\
 		/* prof_tdata->threshold.  However, delay updating    */\
 		/* these variables until prof_{m,re}alloc(), because  */\
 		/* we don't know for sure that the allocation will    */\
 		/* succeed.                                           */\
 		/*                                                    */\
 		/* Use subtraction rather than addition to avoid      */\
 		/* potential integer overflow.                        */\
 		if (size >= prof_tdata->threshold -			\
 		    prof_tdata->accum) {				\
 			bt_init(&bt, prof_tdata->vec);			\
 			prof_backtrace(&bt, nignore);			\
 			ret = prof_lookup(&bt);				\
 		} else							\
 			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
 	}								\
 } while (0)

 #ifndef JEMALLOC_ENABLE_INLINE
 malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

 prof_tdata_t	*prof_tdata_get(bool create);
 void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
 prof_ctx_t	*prof_ctx_get(const void *ptr);
 void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
 bool	prof_sample_accum_update(size_t size);
 void	prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
 void	prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
     size_t old_size, prof_ctx_t *old_ctx);
 void	prof_free(const void *ptr, size_t size);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
 /* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
 malloc_tsd_externs(prof_tdata, prof_tdata_t *)
 malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
     prof_tdata_cleanup)

 /*
  * Fetch the calling thread's profiling data from thread-specific storage.
  *
  * When the stored pointer is NULL and create is true, prof_tdata_init() is
  * called and its result returned instead.  Any non-NULL stored value,
  * including the PROF_TDATA_STATE_* sentinels, is returned unchanged.
  */
 JEMALLOC_INLINE prof_tdata_t *
 prof_tdata_get(bool create)
 {
 	prof_tdata_t *tdata;

 	cassert(config_prof);

 	tdata = *prof_tdata_tsd_get();
 	if (tdata != NULL || !create)
 		return (tdata);

 	return (prof_tdata_init());
 }

 /*
  * Draw a new sampling threshold for prof_tdata and store it in
  * prof_tdata->threshold.  prof_tdata->prng_state is advanced as a side
  * effect.  The stored threshold is always at least 1, so it never collides
  * with the 0 value that PROF_ALLOC_PREP() treats as "uninitialized".
  */
 JEMALLOC_INLINE void
 prof_sample_threshold_update(prof_tdata_t *prof_tdata)
 {
 	/*
 	 * The body of this function is compiled out unless heap profiling is
 	 * enabled, so that it is possible to compile jemalloc with floating
 	 * point support completely disabled.  Avoiding floating point code is
 	 * important on memory-constrained systems, but it also enables a
 	 * workaround for versions of glibc that don't properly save/restore
 	 * floating point registers during dynamic lazy symbol loading (which
 	 * internally calls into whatever malloc implementation happens to be
 	 * integrated into the application).  Note that some compilers (e.g.
 	 * gcc 4.8) may use floating point registers for fast memory moves, so
 	 * jemalloc must be compiled with such optimizations disabled (e.g.
 	 * -mno-sse) in order for the workaround to be complete.
 	 */
 #ifdef JEMALLOC_PROF
 	uint64_t r;
 	double u;

 	cassert(config_prof);

 	/*
 	 * Compute sample threshold as a geometrically distributed random
 	 * variable with mean (2^opt_lg_prof_sample).
 	 *
 	 *                         __        __
 	 *                         |  log(u)  |                     1
 	 * prof_tdata->threshold = | -------- |, where p = -------------------
 	 *                         | log(1-p) |             opt_lg_prof_sample
 	 *                                                 2
 	 *
 	 * For more information on the math, see:
 	 *
 	 *   Non-Uniform Random Variate Generation
 	 *   Luc Devroye
 	 *   Springer-Verlag, New York, 1986
 	 *   pp 500
 	 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
 	 */
 	prng64(r, 53, prof_tdata->prng_state,
 	    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
 	/*
 	 * r is scaled by 2^-53 to obtain u.  If the generator yields r == 0,
 	 * log(u) would be -inf and the quotient below +inf; converting an
 	 * infinite double to uint64_t is undefined behavior.  Substitute
 	 * u = 1.0 in that case, which makes log(u) == 0 and yields the
 	 * well-defined threshold of 1.
 	 */
 	u = (r == 0U) ? 1.0 : (double)r * (1.0/9007199254740992.0L);
 	prof_tdata->threshold = (uint64_t)(log(u) /
 	    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
 	    + (uint64_t)1U;
 #endif
 }

 /*
  * Return the profiling context recorded for allocation ptr (non-NULL).
  *
  * The lookup is delegated to huge_prof_ctx_get() when ptr equals the base
  * address computed by CHUNK_ADDR2BASE(), and to arena_prof_ctx_get()
  * otherwise.
  */
 JEMALLOC_INLINE prof_ctx_t *
 prof_ctx_get(const void *ptr)
 {
 	arena_chunk_t *base;

 	cassert(config_prof);
 	assert(ptr != NULL);

 	base = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	if (base == ptr)
 		return (huge_prof_ctx_get(ptr));

 	/* Region. */
 	return (arena_prof_ctx_get(ptr));
 }

 /*
  * Record ctx as the profiling context of allocation ptr (non-NULL).
  *
  * The store is delegated to huge_prof_ctx_set() when ptr equals the base
  * address computed by CHUNK_ADDR2BASE(), and to arena_prof_ctx_set()
  * otherwise.
  */
 JEMALLOC_INLINE void
 prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
 {
 	arena_chunk_t *base;

 	cassert(config_prof);
 	assert(ptr != NULL);

 	base = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	if (base == ptr) {
 		huge_prof_ctx_set(ptr, ctx);
 		return;
 	}

 	/* Region. */
 	arena_prof_ctx_set(ptr, ctx);
 }

 /*
  * Account size bytes against the calling thread's sampling state.
  *
  * Returns true when the allocation must NOT be sampled: either the thread
  * has no usable profiling data, or accum + size stays below threshold (in
  * which case accum simply grows by size).  Returns false when the
  * threshold is reached; accum is then reduced by the old threshold and new
  * thresholds are drawn until accum is below the current one.
  *
  * Must not be called when opt_lg_prof_sample is 0.
  */
 JEMALLOC_INLINE bool
 prof_sample_accum_update(size_t size)
 {
 	prof_tdata_t *tdata;

 	cassert(config_prof);
 	/* Sampling logic is unnecessary if the interval is 1. */
 	assert(opt_lg_prof_sample != 0);

 	tdata = prof_tdata_get(false);
 	if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
 		return (true);

 	/* Compare via subtraction to avoid integer overflow. */
 	if (size < tdata->threshold - tdata->accum) {
 		tdata->accum += size;
 		return (true);
 	}

 	/* Threshold crossed: carry the excess over and redraw. */
 	tdata->accum -= (tdata->threshold - size);
 	prof_sample_threshold_update(tdata);
 	while (tdata->accum >= tdata->threshold) {
 		tdata->accum -= tdata->threshold;
 		prof_sample_threshold_update(tdata);
 	}
 	return (false);
 }

 /*
  * Record a completed malloc()-like allocation.
  *
  *   ptr:  the new allocation; must not be NULL.
  *   size: asserted to equal isalloc(ptr, true).
  *   cnt:  the value PROF_ALLOC_PREP() produced for this allocation.  Values
  *         greater than (uintptr_t)1 are real counter objects; anything else
  *         means "not sampled".
  *
  * For a sampled allocation the object's context is set to cnt->ctx and the
  * counters are bumped inside an epoch-odd window.  Otherwise the context
  * is set to the sentinel (prof_ctx_t *)1.
  */
 JEMALLOC_INLINE void
 prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
 {

 	cassert(config_prof);
 	assert(ptr != NULL);
 	assert(size == isalloc(ptr, true));

 	if (opt_lg_prof_sample != 0) {
 		if (prof_sample_accum_update(size)) {
 			/*
 			 * Don't sample.  For malloc()-like allocation, it is
 			 * always possible to tell in advance how large an
 			 * object's usable size will be, so there should never
 			 * be a difference between the size passed to
 			 * PROF_ALLOC_PREP() and prof_malloc().
 			 */
 			assert((uintptr_t)cnt == (uintptr_t)1U);
 		}
 	}

 	if ((uintptr_t)cnt > (uintptr_t)1U) {
 		prof_ctx_set(ptr, cnt->ctx);

 		/* Epoch goes odd: readers must not trust cnts until it is even. */
 		cnt->epoch++;
 		/*********/
 		mb_write();
 		/*********/
 		cnt->cnts.curobjs++;
 		cnt->cnts.curbytes += size;
 		if (opt_prof_accum) {
 			cnt->cnts.accumobjs++;
 			cnt->cnts.accumbytes += size;
 		}
 		/*********/
 		mb_write();
 		/*********/
 		/* Epoch goes even again: the update is complete. */
 		cnt->epoch++;
 		/*********/
 		mb_write();
 		/*********/
 	} else
 		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
 }

 /*
  * Record a realloc(): charge the new object to cnt and release the old
  * object from old_ctx.
  *
  *   ptr:      the resulting allocation.  May be NULL only when cnt is not a
  *             real counter object (cnt <= 1).
  *   size:     asserted to equal isalloc(ptr, true) when ptr is non-NULL.
  *   cnt:      value produced by PROF_ALLOC_PREP() for the new size.
  *   old_size: size to subtract from the old object's counters.
  *   old_ctx:  context of the old object; values <= 1 mean it was not
  *             sampled and nothing is subtracted.
  *
  * Both counter objects (old and new) are updated inside one shared
  * epoch-odd window delimited by mb_write() barriers.
  */
 JEMALLOC_INLINE void
 prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
     size_t old_size, prof_ctx_t *old_ctx)
 {
 	prof_thr_cnt_t *told_cnt;

 	cassert(config_prof);
 	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

 	if (ptr != NULL) {
 		assert(size == isalloc(ptr, true));
 		if (opt_lg_prof_sample != 0) {
 			if (prof_sample_accum_update(size)) {
 				/*
 				 * Don't sample.  The size passed to
 				 * PROF_ALLOC_PREP() was larger than what
 				 * actually got allocated, so a backtrace was
 				 * captured for this allocation, even though
 				 * its actual size was insufficient to cross
 				 * the sample threshold.
 				 */
 				cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
 			}
 		}
 	}

 	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
 		told_cnt = prof_lookup(old_ctx->bt);
 		if (told_cnt == NULL) {
 			/*
 			 * It's too late to propagate OOM for this realloc(),
 			 * so operate directly on old_cnt->ctx->cnt_merged.
 			 */
 			malloc_mutex_lock(old_ctx->lock);
 			old_ctx->cnt_merged.curobjs--;
 			old_ctx->cnt_merged.curbytes -= old_size;
 			malloc_mutex_unlock(old_ctx->lock);
 			/* Already accounted for; skip the per-thread path below. */
 			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
 		}
 	} else
 		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

 	/* Open the epoch-odd window on every counter object involved. */
 	if ((uintptr_t)told_cnt > (uintptr_t)1U)
 		told_cnt->epoch++;
 	if ((uintptr_t)cnt > (uintptr_t)1U) {
 		prof_ctx_set(ptr, cnt->ctx);
 		cnt->epoch++;
 	} else if (ptr != NULL)
 		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
 	/*********/
 	mb_write();
 	/*********/
 	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
 		told_cnt->cnts.curobjs--;
 		told_cnt->cnts.curbytes -= old_size;
 	}
 	if ((uintptr_t)cnt > (uintptr_t)1U) {
 		cnt->cnts.curobjs++;
 		cnt->cnts.curbytes += size;
 		if (opt_prof_accum) {
 			cnt->cnts.accumobjs++;
 			cnt->cnts.accumbytes += size;
 		}
 	}
 	/*********/
 	mb_write();
 	/*********/
 	/* Close the window: epochs return to even values. */
 	if ((uintptr_t)told_cnt > (uintptr_t)1U)
 		told_cnt->epoch++;
 	if ((uintptr_t)cnt > (uintptr_t)1U)
 		cnt->epoch++;
 	/*********/
 	mb_write(); /* Not strictly necessary. */
 }

 /*
  * Record the deallocation of ptr.
  *
  *   ptr:  the object being freed; its context is read via prof_ctx_get().
  *   size: asserted to equal isalloc(ptr, true) when the object was sampled.
  *
  * Nothing is done for objects whose context is <= 1 (not sampled).  For a
  * sampled object the calling thread's counters for the context's backtrace
  * are decremented inside an epoch-odd window; if prof_lookup() returns
  * NULL, the context's cnt_merged is decremented under the ctx lock
  * instead.
  */
 JEMALLOC_INLINE void
 prof_free(const void *ptr, size_t size)
 {
 	prof_ctx_t *ctx = prof_ctx_get(ptr);

 	cassert(config_prof);

 	if ((uintptr_t)ctx > (uintptr_t)1) {
 		prof_thr_cnt_t *tcnt;
 		assert(size == isalloc(ptr, true));
 		tcnt = prof_lookup(ctx->bt);

 		if (tcnt != NULL) {
 			/* Epoch goes odd while the counters are in flux. */
 			tcnt->epoch++;
 			/*********/
 			mb_write();
 			/*********/
 			tcnt->cnts.curobjs--;
 			tcnt->cnts.curbytes -= size;
 			/*********/
 			mb_write();
 			/*********/
 			/* Epoch goes even again: the update is complete. */
 			tcnt->epoch++;
 			/*********/
 			mb_write();
 			/*********/
 		} else {
 			/*
 			 * OOM during free() cannot be propagated, so operate
 			 * directly on cnt->ctx->cnt_merged.
 			 */
 			malloc_mutex_lock(ctx->lock);
 			ctx->cnt_merged.curobjs--;
 			ctx->cnt_merged.curbytes -= size;
 			malloc_mutex_unlock(ctx->lock);
 		}
 	}
 }
 #endif

 #endif /* JEMALLOC_H_INLINES */
 /******************************************************************************/
	/******************************************************************************/
	#ifdef JEMALLOC_H_TYPES

	typedef struct prof_bt_s prof_bt_t;
	typedef struct prof_cnt_s prof_cnt_t;
	typedef struct prof_thr_cnt_s prof_thr_cnt_t;
	typedef struct prof_ctx_s prof_ctx_t;
	typedef struct prof_tdata_s prof_tdata_t;

	/* Option defaults. */
	#define PROF_PREFIX_DEFAULT "jeprof"
	#define LG_PROF_SAMPLE_DEFAULT 19
	#define LG_PROF_INTERVAL_DEFAULT -1

	/*
	* Hard limit on stack backtrace depth. The version of prof_backtrace() that
	* is based on __builtin_return_address() necessarily has a hard-coded number
	* of backtrace frame handlers, and should be kept in sync with this setting.
	*/
	#define PROF_BT_MAX 128

	/* Maximum number of backtraces to store in each per thread LRU cache. */
	#define PROF_TCMAX 1024

	/* Initial hash table size. */
	#define PROF_CKH_MINITEMS 64

	/* Size of memory buffer to use when writing dump files. */
	#define PROF_DUMP_BUFSIZE 65536

	/* Size of stack-allocated buffer used by prof_printf(). */
	#define PROF_PRINTF_BUFSIZE 128

	/*
	* Number of mutexes shared among all ctx's. No space is allocated for these
	* unless profiling is enabled, so it's okay to over-provision.
	*/
	#define PROF_NCTX_LOCKS 1024

	/*
	* prof_tdata pointers close to NULL are used to encode state information that
	* is used for cleaning up during thread shutdown.
	*/
	#define PROF_TDATA_STATE_REINCARNATED ((prof_tdata_t *)(uintptr_t)1)
	#define PROF_TDATA_STATE_PURGATORY ((prof_tdata_t *)(uintptr_t)2)
	#define PROF_TDATA_STATE_MAX PROF_TDATA_STATE_PURGATORY

	#endif /* JEMALLOC_H_TYPES */
	/******************************************************************************/
	#ifdef JEMALLOC_H_STRUCTS

	/*
	 * A captured stack backtrace: a vector of program counters and the
	 * number of entries stored in it.
	 */
	struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void **vec;
	unsigned len;
	};

	#ifdef JEMALLOC_PROF_LIBGCC
	/*
	 * Data structure passed to libgcc _Unwind_Backtrace() callback functions.
	 * Only declared when JEMALLOC_PROF_LIBGCC is defined.
	 * NOTE(review): nignore/max presumably bound which frames are recorded
	 * into bt -- confirm against the callback implementation.
	 */
	typedef struct {
	prof_bt_t *bt;
	unsigned nignore;
	unsigned max;
	} prof_unwind_data_t;
	#endif

	/*
	 * One set of profiling counters.  Embedded in prof_thr_cnt_t (cnts) and
	 * in prof_ctx_t (cnt_summed, cnt_merged).
	 */
	struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 *
	 * The accum* fields are only advanced when opt_prof_accum is true.
	 */
	int64_t curobjs;
	int64_t curbytes;
	uint64_t accumobjs;
	uint64_t accumbytes;
	};

	/*
	 * Counters owned by one thread for one prof_ctx_t.  The object is linked
	 * into two lists at once: the ctx's cnts_ql and the owning thread's LRU.
	 */
	struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t) cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t) lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t *ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must assure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.
	 */
	unsigned epoch;

	/* Profiling counters, guarded by the epoch protocol described above. */
	prof_cnt_t cnts;
	};

	/*
	 * Profiling context for one backtrace.  It aggregates the per-thread
	 * counter objects linked into cnts_ql plus the totals left behind by
	 * threads that have exited (cnt_merged).
	 */
	struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t *bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t *lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t) cnts_ql;
	};

	/*
	 * Per-thread profiling state, obtained through prof_tdata_get().  Pointer
	 * values up to PROF_TDATA_STATE_MAX are state sentinels rather than real
	 * objects and must not be dereferenced.
	 */
	struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t) lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void **vec;

	/*
	 * Sampling state.  threshold == 0 means "not yet initialized" (see
	 * PROF_ALLOC_PREP()); accum counts bytes accumulated toward threshold.
	 */
	uint64_t prng_state;
	uint64_t threshold;
	uint64_t accum;

	/* State used to avoid dumping while operating on prof internals. */
	bool enq;
	bool enq_idump;
	bool enq_gdump;
	};

	#endif /* JEMALLOC_H_STRUCTS */
	/******************************************************************************/
	#ifdef JEMALLOC_H_EXTERNS

	extern bool opt_prof;
	/*
	* Even if opt_prof is true, sampling can be temporarily disabled by setting
	* opt_prof_active to false. No locking is used when updating opt_prof_active,
	* so there are no guarantees regarding how long it will take for all threads
	* to notice state changes.
	*/
	extern bool opt_prof_active;
	extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
	extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
	extern bool opt_prof_gdump; /* High-water memory dumping. */
	extern bool opt_prof_final; /* Final profile dumping. */
	extern bool opt_prof_leak; /* Dump leak summary at exit. */
	extern bool opt_prof_accum; /* Report cumulative bytes. */
	extern char opt_prof_prefix[PATH_MAX + 1];

	/*
	* Profile dump interval, measured in bytes allocated. Each arena triggers a
	* profile dump when it reaches this threshold. The effect is that the
	* interval between profile dumps averages prof_interval, though the actual
	* interval between dumps will tend to be sporadic, and the interval will be a
	* maximum of approximately (prof_interval * narenas).
	*/
	extern uint64_t prof_interval;

	/*
	* If true, promote small sampled objects to large objects, since small run
	* headers do not have embedded profile context pointers.
	*/
	extern bool prof_promote;

	/*
	 * Out-of-line profiling entry points.
	 *
	 * bt_init(), prof_backtrace() and prof_lookup() all take a pointer to the
	 * backtrace: PROF_ALLOC_PREP() calls them as bt_init(&bt, prof_tdata->vec),
	 * prof_backtrace(&bt, nignore) and prof_lookup(&bt), and the result of
	 * prof_lookup() is compared against NULL and dereferenced by its callers.
	 */
	void bt_init(prof_bt_t *bt, void **vec);
	void prof_backtrace(prof_bt_t *bt, unsigned nignore);
	prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
	void prof_idump(void);
	bool prof_mdump(const char *filename);
	void prof_gdump(void);
	prof_tdata_t *prof_tdata_init(void);
	void prof_tdata_cleanup(void *arg);
	void prof_boot0(void);
	void prof_boot1(void);
	bool prof_boot2(void);
	void prof_prefork(void);
	void prof_postfork_parent(void);
	void prof_postfork_child(void);

	#endif /* JEMALLOC_H_EXTERNS */
	/******************************************************************************/
	#ifdef JEMALLOC_H_INLINES

	/*
	 * Decide, before an allocation is attempted, whether it will be sampled,
	 * and store the decision in ret:
	 *   NULL                 prof_tdata_get(true) returned NULL;
	 *   (prof_thr_cnt_t *)1  do not sample;
	 *   prof_lookup(&bt)     sample, using a freshly captured backtrace.
	 *
	 * size is evaluated several times and its address is taken (&size), so
	 * the argument must be a side-effect-free lvalue.  The `break` inside the
	 * body leaves the enclosing do { } while (0).
	 */
	#define PROF_ALLOC_PREP(nignore, size, ret) do { \
	prof_tdata_t *prof_tdata; \
	prof_bt_t bt; \
	\
	assert(size == s2u(size)); \
	\
	prof_tdata = prof_tdata_get(true); \
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { \
	if (prof_tdata != NULL) \
	ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
	else \
	ret = NULL; \
	break; \
	} \
	\
	if (opt_prof_active == false) { \
	/* Sampling is currently inactive, so avoid sampling. */\
	ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
	} else if (opt_lg_prof_sample == 0) { \
	/* Don't bother with sampling logic, since sampling */\
	/* interval is 1. */\
	bt_init(&bt, prof_tdata->vec); \
	prof_backtrace(&bt, nignore); \
	ret = prof_lookup(&bt); \
	} else { \
	if (prof_tdata->threshold == 0) { \
	/* Initialize. Seed the prng differently for */\
	/* each thread. */\
	prof_tdata->prng_state = \
	(uint64_t)(uintptr_t)&size; \
	prof_sample_threshold_update(prof_tdata); \
	} \
	\
	/* Determine whether to capture a backtrace based on */\
	/* whether size is enough for prof_accum to reach */\
	/* prof_tdata->threshold. However, delay updating */\
	/* these variables until prof_{m,re}alloc(), because */\
	/* we don't know for sure that the allocation will */\
	/* succeed. */\
	/* */\
	/* Use subtraction rather than addition to avoid */\
	/* potential integer overflow. */\
	if (size >= prof_tdata->threshold - \
	prof_tdata->accum) { \
	bt_init(&bt, prof_tdata->vec); \
	prof_backtrace(&bt, nignore); \
	ret = prof_lookup(&bt); \
	} else \
	ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
	} \
	} while (0)

	#ifndef JEMALLOC_ENABLE_INLINE
	/*
	 * Prototypes for the functions defined further down in this section; used
	 * when they are not compiled as inlines.  Pointer declarators must match
	 * the definitions: ptr, cnt and ctx arguments are all passed by pointer.
	 */
	malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

	prof_tdata_t *prof_tdata_get(bool create);
	void prof_sample_threshold_update(prof_tdata_t *prof_tdata);
	prof_ctx_t *prof_ctx_get(const void *ptr);
	void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
	bool prof_sample_accum_update(size_t size);
	void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
	void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
	    size_t old_size, prof_ctx_t *old_ctx);
	void prof_free(const void *ptr, size_t size);
	#endif

	#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
	/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
	malloc_tsd_externs(prof_tdata, prof_tdata_t *)
	malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
	    prof_tdata_cleanup)

	/*
	 * Fetch the calling thread's profiling data from thread-specific storage.
	 *
	 * When the stored pointer is NULL and create is true, prof_tdata_init() is
	 * called and its result returned instead.  Any non-NULL stored value,
	 * including the PROF_TDATA_STATE_* sentinels, is returned unchanged.
	 */
	JEMALLOC_INLINE prof_tdata_t *
	prof_tdata_get(bool create)
	{
		prof_tdata_t *tdata;

		cassert(config_prof);

		tdata = *prof_tdata_tsd_get();
		if (tdata != NULL || !create)
			return (tdata);

		return (prof_tdata_init());
	}

	/*
	 * Draw a new sampling threshold for prof_tdata and store it in
	 * prof_tdata->threshold.  prof_tdata->prng_state is advanced as a side
	 * effect.  The stored threshold is always at least 1, so it never
	 * collides with the 0 value that PROF_ALLOC_PREP() treats as
	 * "uninitialized".
	 */
	JEMALLOC_INLINE void
	prof_sample_threshold_update(prof_tdata_t *prof_tdata)
	{
		/*
		 * The body of this function is compiled out unless heap profiling is
		 * enabled, so that it is possible to compile jemalloc with floating
		 * point support completely disabled.  Avoiding floating point code is
		 * important on memory-constrained systems, but it also enables a
		 * workaround for versions of glibc that don't properly save/restore
		 * floating point registers during dynamic lazy symbol loading (which
		 * internally calls into whatever malloc implementation happens to be
		 * integrated into the application).  Note that some compilers (e.g.
		 * gcc 4.8) may use floating point registers for fast memory moves, so
		 * jemalloc must be compiled with such optimizations disabled (e.g.
		 * -mno-sse) in order for the workaround to be complete.
		 */
	#ifdef JEMALLOC_PROF
		uint64_t r;
		double u;

		cassert(config_prof);

		/*
		 * Compute sample threshold as a geometrically distributed random
		 * variable with mean (2^opt_lg_prof_sample).
		 *
		 *                         __        __
		 *                         |  log(u)  |                     1
		 * prof_tdata->threshold = | -------- |, where p = -------------------
		 *                         | log(1-p) |             opt_lg_prof_sample
		 *                                                 2
		 *
		 * For more information on the math, see:
		 *
		 *   Non-Uniform Random Variate Generation
		 *   Luc Devroye
		 *   Springer-Verlag, New York, 1986
		 *   pp 500
		 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
		 */
		prng64(r, 53, prof_tdata->prng_state,
		    UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
		/*
		 * r is scaled by 2^-53 to obtain u.  If the generator yields r == 0,
		 * log(u) would be -inf and the quotient below +inf; converting an
		 * infinite double to uint64_t is undefined behavior.  Substitute
		 * u = 1.0 in that case, which makes log(u) == 0 and yields the
		 * well-defined threshold of 1.
		 */
		u = (r == 0U) ? 1.0 : (double)r * (1.0/9007199254740992.0L);
		prof_tdata->threshold = (uint64_t)(log(u) /
		    log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
		    + (uint64_t)1U;
	#endif
	}

	/*
	 * Return the profiling context recorded for allocation ptr (non-NULL).
	 *
	 * The lookup is delegated to huge_prof_ctx_get() when ptr equals the base
	 * address computed by CHUNK_ADDR2BASE(), and to arena_prof_ctx_get()
	 * otherwise.
	 */
	JEMALLOC_INLINE prof_ctx_t *
	prof_ctx_get(const void *ptr)
	{
		arena_chunk_t *base;

		cassert(config_prof);
		assert(ptr != NULL);

		base = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
		if (base == ptr)
			return (huge_prof_ctx_get(ptr));

		/* Region. */
		return (arena_prof_ctx_get(ptr));
	}

	/*
	 * Record ctx as the profiling context of allocation ptr (non-NULL).
	 *
	 * The store is delegated to arena_prof_ctx_set() when ptr differs from the
	 * base address computed by CHUNK_ADDR2BASE(), and to huge_prof_ctx_set()
	 * when the two are equal.  Both arguments are pointers: ptr is compared
	 * against NULL and callers pass cnt->ctx or a (prof_ctx_t *) sentinel.
	 */
	JEMALLOC_INLINE void
	prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
	{
		arena_chunk_t *chunk;

		cassert(config_prof);
		assert(ptr != NULL);

		chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
		if (chunk != ptr) {
			/* Region. */
			arena_prof_ctx_set(ptr, ctx);
		} else
			huge_prof_ctx_set(ptr, ctx);
	}

	/*
	 * Account size bytes against the calling thread's sampling state.
	 *
	 * Returns true when the allocation must NOT be sampled: either the thread
	 * has no usable profiling data, or accum + size stays below threshold (in
	 * which case accum simply grows by size).  Returns false when the
	 * threshold is reached; accum is then reduced by the old threshold and
	 * new thresholds are drawn until accum is below the current one.
	 *
	 * Must not be called when opt_lg_prof_sample is 0.
	 */
	JEMALLOC_INLINE bool
	prof_sample_accum_update(size_t size)
	{
		prof_tdata_t *tdata;

		cassert(config_prof);
		/* Sampling logic is unnecessary if the interval is 1. */
		assert(opt_lg_prof_sample != 0);

		tdata = prof_tdata_get(false);
		if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
			return (true);

		/* Compare via subtraction to avoid integer overflow. */
		if (size < tdata->threshold - tdata->accum) {
			tdata->accum += size;
			return (true);
		}

		/* Threshold crossed: carry the excess over and redraw. */
		tdata->accum -= (tdata->threshold - size);
		prof_sample_threshold_update(tdata);
		while (tdata->accum >= tdata->threshold) {
			tdata->accum -= tdata->threshold;
			prof_sample_threshold_update(tdata);
		}
		return (false);
	}

	/*
	 * Record a completed malloc()-like allocation.
	 *
	 *   ptr:  the new allocation; must not be NULL.
	 *   size: asserted to equal isalloc(ptr, true).
	 *   cnt:  the value PROF_ALLOC_PREP() produced for this allocation.
	 *         Values greater than (uintptr_t)1 are real counter objects;
	 *         anything else means "not sampled".
	 *
	 * For a sampled allocation the object's context is set to cnt->ctx and
	 * the counters are bumped inside an epoch-odd window.  Otherwise the
	 * context is set to the sentinel (prof_ctx_t *)1.
	 */
	JEMALLOC_INLINE void
	prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
	{

		cassert(config_prof);
		assert(ptr != NULL);
		assert(size == isalloc(ptr, true));

		if (opt_lg_prof_sample != 0) {
			if (prof_sample_accum_update(size)) {
				/*
				 * Don't sample.  For malloc()-like allocation, it is
				 * always possible to tell in advance how large an
				 * object's usable size will be, so there should never
				 * be a difference between the size passed to
				 * PROF_ALLOC_PREP() and prof_malloc().
				 */
				assert((uintptr_t)cnt == (uintptr_t)1U);
			}
		}

		if ((uintptr_t)cnt > (uintptr_t)1U) {
			prof_ctx_set(ptr, cnt->ctx);

			/* Epoch goes odd: readers must wait until it is even. */
			cnt->epoch++;
			/*********/
			mb_write();
			/*********/
			cnt->cnts.curobjs++;
			cnt->cnts.curbytes += size;
			if (opt_prof_accum) {
				cnt->cnts.accumobjs++;
				cnt->cnts.accumbytes += size;
			}
			/*********/
			mb_write();
			/*********/
			/* Epoch goes even again: the update is complete. */
			cnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else
			prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	}

	/*
	 * Record a realloc(): charge the new object to cnt and release the old
	 * object from old_ctx.
	 *
	 *   ptr:      the resulting allocation.  May be NULL only when cnt is not
	 *             a real counter object (cnt <= 1).
	 *   size:     asserted to equal isalloc(ptr, true) when ptr is non-NULL.
	 *   cnt:      value produced by PROF_ALLOC_PREP() for the new size.
	 *   old_size: size to subtract from the old object's counters.
	 *   old_ctx:  context of the old object; values <= 1 mean it was not
	 *             sampled and nothing is subtracted.
	 *
	 * Both counter objects (old and new) are updated inside one shared
	 * epoch-odd window delimited by mb_write() barriers.
	 */
	JEMALLOC_INLINE void
	prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
	    size_t old_size, prof_ctx_t *old_ctx)
	{
		prof_thr_cnt_t *told_cnt;

		cassert(config_prof);
		assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

		if (ptr != NULL) {
			assert(size == isalloc(ptr, true));
			if (opt_lg_prof_sample != 0) {
				if (prof_sample_accum_update(size)) {
					/*
					 * Don't sample.  The size passed to
					 * PROF_ALLOC_PREP() was larger than what
					 * actually got allocated, so a backtrace was
					 * captured for this allocation, even though
					 * its actual size was insufficient to cross
					 * the sample threshold.
					 */
					cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
				}
			}
		}

		if ((uintptr_t)old_ctx > (uintptr_t)1U) {
			told_cnt = prof_lookup(old_ctx->bt);
			if (told_cnt == NULL) {
				/*
				 * It's too late to propagate OOM for this realloc(),
				 * so operate directly on old_cnt->ctx->cnt_merged.
				 */
				malloc_mutex_lock(old_ctx->lock);
				old_ctx->cnt_merged.curobjs--;
				old_ctx->cnt_merged.curbytes -= old_size;
				malloc_mutex_unlock(old_ctx->lock);
				/* Already accounted for; skip the per-thread path. */
				told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
			}
		} else
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

		/* Open the epoch-odd window on every counter object involved. */
		if ((uintptr_t)told_cnt > (uintptr_t)1U)
			told_cnt->epoch++;
		if ((uintptr_t)cnt > (uintptr_t)1U) {
			prof_ctx_set(ptr, cnt->ctx);
			cnt->epoch++;
		} else if (ptr != NULL)
			prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
		/*********/
		mb_write();
		/*********/
		if ((uintptr_t)told_cnt > (uintptr_t)1U) {
			told_cnt->cnts.curobjs--;
			told_cnt->cnts.curbytes -= old_size;
		}
		if ((uintptr_t)cnt > (uintptr_t)1U) {
			cnt->cnts.curobjs++;
			cnt->cnts.curbytes += size;
			if (opt_prof_accum) {
				cnt->cnts.accumobjs++;
				cnt->cnts.accumbytes += size;
			}
		}
		/*********/
		mb_write();
		/*********/
		/* Close the window: epochs return to even values. */
		if ((uintptr_t)told_cnt > (uintptr_t)1U)
			told_cnt->epoch++;
		if ((uintptr_t)cnt > (uintptr_t)1U)
			cnt->epoch++;
		/*********/
		mb_write(); /* Not strictly necessary. */
	}

	/*
	 * Record the deallocation of ptr.
	 *
	 *   ptr:  the object being freed; its context is read via prof_ctx_get().
	 *   size: asserted to equal isalloc(ptr, true) when the object was
	 *         sampled.
	 *
	 * Nothing is done for objects whose context is <= 1 (not sampled).  For a
	 * sampled object the calling thread's counters for the context's
	 * backtrace are decremented inside an epoch-odd window; if prof_lookup()
	 * returns NULL, the context's cnt_merged is decremented under the ctx
	 * lock instead.
	 */
	JEMALLOC_INLINE void
	prof_free(const void *ptr, size_t size)
	{
		prof_ctx_t *ctx = prof_ctx_get(ptr);
		prof_thr_cnt_t *thr_cnt;

		cassert(config_prof);

		/* Unsampled objects carry a sentinel context; nothing to undo. */
		if ((uintptr_t)ctx <= (uintptr_t)1)
			return;

		assert(size == isalloc(ptr, true));
		thr_cnt = prof_lookup(ctx->bt);

		if (thr_cnt == NULL) {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on cnt->ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
			return;
		}

		/* Epoch goes odd while the counters are in flux. */
		thr_cnt->epoch++;
		/*********/
		mb_write();
		/*********/
		thr_cnt->cnts.curobjs--;
		thr_cnt->cnts.curbytes -= size;
		/*********/
		mb_write();
		/*********/
		/* Epoch goes even again: the update is complete. */
		thr_cnt->epoch++;
		/*********/
		mb_write();
		/*********/
	}
	#endif

	#endif /* JEMALLOC_H_INLINES */
	/******************************************************************************/