"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

================== ==================================================
Function           Description
================== ==================================================
mean               Arithmetic mean (average) of data.
fmean              Fast, floating point arithmetic mean.
geometric_mean     Geometric mean of data.
harmonic_mean      Harmonic mean of data.
median             Median (middle value) of data.
median_low         Low median of data.
median_high        High median of data.
median_grouped     Median, or 50th percentile, of grouped data.
mode               Mode (most common value) of data.
multimode          List of modes (most common values of data).
quantiles          Divide data into intervals with equal probability.
================== ==================================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5. The median of these data points is 2.8333...

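Divide data into intervals with equal probability (here, the three quartile
cut points for ten evenly spaced values):

>>> quantiles([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
[2.75, 5.5, 8.25]
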

Calculating variability or spread
---------------------------------

================== =============================================
Function           Description
================== =============================================
pvariance          Population variance of data.
variance           Sample variance of data.
pstdev             Population standard deviation of data.
stdev              Sample standard deviation of data.
================== =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = ['StatisticsError', 'NormalDist', 'quantiles',
           'pstdev', 'pvariance', 'stdev', 'variance',
           'median', 'median_low', 'median_high', 'median_grouped',
           'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
           'geometric_mean',
           ]

import math
import numbers
import random

from fractions import Fraction
from decimal import Decimal
from itertools import groupby
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from operator import itemgetter
from collections import Counter

# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get
    T = _coerce(int, type(start))
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
    """
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S: return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool: return T
    if T is int: return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T): return S
    if issubclass(T, S): return T
    # Ints coerce to the other type.
    if issubclass(T, int): return S
    if issubclass(S, int): return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))
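
# Some concrete consequences of the coercion rules above (illustrative notes,
# not an exhaustive specification):
#     _coerce(int, float)      -> float
#     _coerce(float, Fraction) -> float
#     _coerce(Fraction, int)   -> Fraction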


def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator)/T(value.denominator)
        else:
            raise


def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError


def _find_rteq(a, l, x):
    'Locate the rightmost value exactly equal to x'
    i = bisect_right(a, x, lo=l)
    if i != (len(a)+1) and a[i-1] == x:
        return i-1
    raise ValueError


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total/n, T)

def fmean(data):
    """ Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    The result is highly accurate but not as perfect as mean().
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25

    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(x):
            nonlocal n
            n += 1
            return x
        total = fsum(map(count, data))
    else:
        total = fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None

def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        return exp(fmean(map(log, data)))
    except ValueError:
350 raise StatisticsError('geometric mean requires a non-empty dataset '
351 ' containing positive numbers') from None

def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    try:
        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        return 0
    assert count == n
    return _convert(n/total, T)


# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        i = n//2
        return (data[i - 1] + data[i])/2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        return data[n//2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n//2]


def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    cf = l1
    f = l2 - l1 + 1
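    # Apply the standard grouped-median formula, L + interval*(n/2 - cf)/f,
    # where L is the lower limit of the median class, cf is the cumulative
    # frequency below that class, and f is the frequency within it.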
    return L + interval*(n/2 - cf)/f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes, return the first one encountered.

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    data = iter(data)
    try:
        return Counter(data).most_common(1)[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """ Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []

    """
    counts = Counter(iter(data)).most_common()
    maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
    return list(map(itemgetter(0), mode_items))

def quantiles(dist, *, n=4, method='exclusive'):
    '''Divide *dist* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
    Set *n* to 100 for percentiles which gives the 99 cut points that
    separate *dist* into 100 equal-sized groups.

    The *dist* can be any iterable containing sample data or it can be
    an instance of a class that defines an inv_cdf() method. For sample
    data, the cut points are linearly interpolated between data points.

    If *method* is set to *inclusive*, *dist* is treated as population
    data. The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.
    '''
    # Possible future API extensions:
    #     quantiles(data, already_sorted=True)
    #     quantiles(data, cut_points=[0.02, 0.25, 0.50, 0.75, 0.98])
    if n < 1:
        raise StatisticsError('n must be at least 1')
    if hasattr(dist, 'inv_cdf'):
        return [dist.inv_cdf(i / n) for i in range(1, n)]
    data = sorted(dist)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        m = ld - 1
        result = []
        for i in range(1, n):
            j = i * m // n
            delta = i*m - j*n
            interpolated = (data[j] * (n - delta) + data[j+1] * delta) / n
            result.append(interpolated)
        return result
    if method == 'exclusive':
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n                               # rescale i to m/n
            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
            delta = i*m - j*n                            # exact integer math
            interpolated = (data[j-1] * (n - delta) + data[j] * delta) / n
            result.append(interpolated)
        return result
    raise ValueError(f'Unknown method: {method!r}')
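
# Worked comparison of the two methods on a small sample (values computed by
# hand from the formulas above): 'exclusive' assumes the true extremes of the
# population may lie outside the observed range, while 'inclusive' treats the
# observed minimum and maximum as the 0th and 100th percentiles.
#
#     quantiles([1, 2, 3, 4, 5], method='exclusive')  -->  [1.5, 3.0, 4.5]
#     quantiles([1, 2, 3, 4, 5], method='inclusive')  -->  [2.0, 3.0, 4.0]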

# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html
#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
#
# Under no circumstances use the so-called "computational formula for
# variance", as that is only suitable for hand calculations with a small
# amount of low-precision data. It has terrible numeric properties.
#
# See a comparison of three computational methods here:
# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
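#
# The _ss() helper below uses a two-pass approach with a correction term:
# after computing SS = sum((x-c)**2), it subtracts (sum(x-c))**2 / n, a
# quantity that is mathematically zero when c is the true mean but that
# mops up accumulated rounding error (or a slightly-off value of c).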

def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.
    """
    if c is None:
        c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    U, total2, count2 = _sum((x-c) for x in data)
    assert T == U and count == count2
    total -= total2**2/len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    return _convert(ss/(n-1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be an iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    This function does not check that ``mu`` is actually the mean of ``data``.
    Giving arbitrary values for ``mu`` may lead to invalid or impossible
    results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    return _convert(ss/n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)

## Normal Distribution #####################################################

class NormalDist:
    'Normal distribution of a random variable'
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {'mu': 'Arithmetic mean of a normal distribution',
                 'sigma': 'Standard deviation of a normal distribution'}

    def __init__(self, mu=0.0, sigma=1.0):
        'NormalDist where mu is the mean and sigma is the standard deviation.'
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self.mu = mu
        self.sigma = sigma

    @classmethod
    def from_samples(cls, data):
        'Make a normal distribution instance from sample data.'
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, *, seed=None):
        'Generate *n* samples for a given mean and standard deviation.'
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self.mu, self.sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        'Probability density function.  P(x <= X < x+dx) / dx'
        variance = self.sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self.mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance)

    def cdf(self, x):
        'Cumulative distribution function.  P(X <= x)'
        if not self.sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))

    def inv_cdf(self, p):
        '''Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of the
        variable being less than or equal to that value equals the given probability.

        This function is also called the percent point function or quantile function.
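
        For example, the 97.5th percentile of the standard normal distribution
        is the familiar cutoff of approximately 1.96:

        >>> round(NormalDist().inv_cdf(0.975), 4)
        1.96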
        '''
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self.sigma <= 0.0:
            raise StatisticsError('inv_cdf() not defined when sigma is at or below zero')

        # There is no closed-form solution to the inverse CDF for the normal
        # distribution, so we use a rational approximation instead:
        # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
        # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
        # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.

        q = p - 0.5
        if fabs(q) <= 0.425:
            r = 0.180625 - q * q
            num = (((((((2.50908_09287_30122_6727e+3 * r +
                         3.34305_75583_58812_8105e+4) * r +
                         6.72657_70927_00870_0853e+4) * r +
                         4.59219_53931_54987_1457e+4) * r +
                         1.37316_93765_50946_1125e+4) * r +
                         1.97159_09503_06551_4427e+3) * r +
                         1.33141_66789_17843_7745e+2) * r +
                         3.38713_28727_96366_6080e+0) * q
            den = (((((((5.22649_52788_52854_5610e+3 * r +
                         2.87290_85735_72194_2674e+4) * r +
                         3.93078_95800_09271_0610e+4) * r +
                         2.12137_94301_58659_5867e+4) * r +
                         5.39419_60214_24751_1077e+3) * r +
                         6.87187_00749_20579_0830e+2) * r +
                         4.23133_30701_60091_1252e+1) * r +
                         1.0)
            x = num / den
            return self.mu + (x * self.sigma)
        r = p if q <= 0.0 else 1.0 - p
        r = sqrt(-log(r))
        if r <= 5.0:
            r = r - 1.6
            num = (((((((7.74545_01427_83414_07640e-4 * r +
                         2.27238_44989_26918_45833e-2) * r +
                         2.41780_72517_74506_11770e-1) * r +
                         1.27045_82524_52368_38258e+0) * r +
                         3.64784_83247_63204_60504e+0) * r +
                         5.76949_72214_60691_40550e+0) * r +
                         4.63033_78461_56545_29590e+0) * r +
                         1.42343_71107_49683_57734e+0)
            den = (((((((1.05075_00716_44416_84324e-9 * r +
                         5.47593_80849_95344_94600e-4) * r +
                         1.51986_66563_61645_71966e-2) * r +
                         1.48103_97642_74800_74590e-1) * r +
                         6.89767_33498_51000_04550e-1) * r +
                         1.67638_48301_83803_84940e+0) * r +
                         2.05319_16266_37758_82187e+0) * r +
                         1.0)
        else:
            r = r - 5.0
            num = (((((((2.01033_43992_92288_13265e-7 * r +
                         2.71155_55687_43487_57815e-5) * r +
                         1.24266_09473_88078_43860e-3) * r +
                         2.65321_89526_57612_30930e-2) * r +
                         2.96560_57182_85048_91230e-1) * r +
                         1.78482_65399_17291_33580e+0) * r +
                         5.46378_49111_64114_36990e+0) * r +
                         6.65790_46435_01103_77720e+0)
            den = (((((((2.04426_31033_89939_78564e-15 * r +
                         1.42151_17583_16445_88870e-7) * r +
                         1.84631_83175_10054_68180e-5) * r +
                         7.86869_13114_56132_59100e-4) * r +
                         1.48753_61290_85061_48525e-2) * r +
                         1.36929_88092_27358_05310e-1) * r +
                         5.99832_20655_58879_37690e-1) * r +
                         1.0)
        x = num / den
        if q < 0.0:
            x = -x
        return self.mu + (x * self.sigma)

    def overlap(self, other):
        '''Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

        >>> N1 = NormalDist(2.4, 1.6)
        >>> N2 = NormalDist(3.2, 2.0)
        >>> N1.overlap(N2)
        0.8035050657330205
        '''
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y.sigma, Y.mu) < (X.sigma, X.mu):   # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y.mu - X.mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X.sigma * sqrt(2.0)))
        a = X.mu * Y_var - Y.mu * X_var
        b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    @property
    def mean(self):
        'Arithmetic mean of the normal distribution.'
        return self.mu

    @property
    def stdev(self):
        'Standard deviation of the normal distribution.'
        return self.sigma

    @property
    def variance(self):
        'Square of the standard deviation.'
        return self.sigma ** 2.0

    def __add__(x1, x2):
        '''Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
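
        For example, sums of independent normal variables add their means and
        combine their sigmas in quadrature (1.5 and 2.0 combine to exactly 2.5):

        >>> NormalDist(2.5, 1.5) + NormalDist(1.5, 2.0)
        NormalDist(mu=4.0, sigma=2.5)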
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu + x2, x1.sigma)

    def __sub__(x1, x2):
        '''Subtract a constant or another NormalDist instance.

968 If *other* is a constant, translate by the constant mu,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu - x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu - x2, x1.sigma)

    def __mul__(x1, x2):
        '''Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu * x2, x1.sigma * fabs(x2))

    def __truediv__(x1, x2):
        '''Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu / x2, x1.sigma / fabs(x2))

    def __pos__(x1):
        'Return a copy of the instance.'
        return NormalDist(x1.mu, x1.sigma)

    def __neg__(x1):
        'Negates mu while keeping sigma the same.'
        return NormalDist(-x1.mu, x1.sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        'Subtract a NormalDist from a constant or another NormalDist.'
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        'Two NormalDist objects are equal if their mu and sigma are both equal.'
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return (x1.mu, x1.sigma) == (x2.mu, x2.sigma)

    def __repr__(self):
        return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})'


if __name__ == '__main__':

    # Show math operations computed analytically in comparison
    # to a Monte Carlo simulation of the same operations

    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat
    import doctest

    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)

    # Test scaling by a constant
    assert (g1 * 5 / 5).mu == g1.mu
    assert (g1 * 5 / 5).sigma == g1.sigma

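    # Check that cdf() and inv_cdf() round-trip to within floating point
    # tolerance (exact equality is not expected because the inverse CDF
    # uses a rational approximation).
    for p in (0.01, 0.1, 0.5, 0.9, 0.99):
        assert isclose(g1.cdf(g1.inv_cdf(p)), p, rel_tol=1e-6)
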
    n = 100_000
    G1 = g1.samples(n)
    G2 = g2.samples(n)

    for func in (add, sub):
        print(f'\nTest {func.__name__} with another NormalDist:')
        print(func(g1, g2))
        print(NormalDist.from_samples(map(func, G1, G2)))

    const = 11
    for func in (add, sub, mul, truediv):
        print(f'\nTest {func.__name__} with a constant:')
        print(func(g1, const))
        print(NormalDist.from_samples(map(func, G1, repeat(const))))

    const = 19
    for func in (add, sub, mul):
        print(f'\nTest constant with {func.__name__}:')
        print(func(const, g1))
        print(NormalDist.from_samples(map(func, repeat(const), G1)))

    def assert_close(G1, G2):
        assert isclose(G1.mu, G2.mu, rel_tol=0.01), (G1, G2)
        assert isclose(G1.sigma, G2.sigma, rel_tol=0.01), (G1, G2)

    X = NormalDist(-105, 73)
    Y = NormalDist(31, 47)
    s = 32.75
    n = 100_000

    S = NormalDist.from_samples([x + s for x in X.samples(n)])
    assert_close(X + s, S)

    S = NormalDist.from_samples([x - s for x in X.samples(n)])
    assert_close(X - s, S)

    S = NormalDist.from_samples([x * s for x in X.samples(n)])
    assert_close(X * s, S)

    S = NormalDist.from_samples([x / s for x in X.samples(n)])
    assert_close(X / s, S)

    S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X + Y, S)

    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)

    print(doctest.testmod())