Blame - Lib/statistics.py - platform/external/python/cpython3

blob: 4b054b961141b423c38e96768232f9e3ea96f423 [file] [log] [blame]

Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	1	"""
				2	Basic statistics module.
				3
				4	This module provides functions for calculating statistics of data, including
				5	averages, variance, and standard deviation.
				6
				7	Calculating averages
				8	--------------------
				9
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	10	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	11	Function Description
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	12	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	13	mean Arithmetic mean (average) of data.
Raymond Hettinger	7280048	2019-04-23 01:35:16 -0700	[diff] [blame]	14	fmean Fast, floating point arithmetic mean.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	15	geometric_mean Geometric mean of data.
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	16	harmonic_mean Harmonic mean of data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	17	median Median (middle value) of data.
				18	median_low Low median of data.
				19	median_high High median of data.
				20	median_grouped Median, or 50th percentile, of grouped data.
				21	mode Mode (most common value) of data.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	22	multimode List of modes (most common values of data).
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	23	quantiles Divide data into intervals with equal probability.
				24	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	25
				26	Calculate the arithmetic mean ("the average") of data:
				27
				28	>>> mean([-1.0, 2.5, 3.25, 5.75])
				29	2.625
				30
				31
				32	Calculate the standard median of discrete data:
				33
				34	>>> median([2, 3, 4, 5])
				35	3.5
				36
				37
				38	Calculate the median, or 50th percentile, of data grouped into class intervals
				39	centred on the data values provided. E.g. if your data points are rounded to
				40	the nearest whole number:
				41
				42	>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
				43	2.8333333333...
				44
				45	This should be interpreted in this way: you have two data points in the class
				46	interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
				47	the class interval 3.5-4.5. The median of these data points is 2.8333...
				48
				49
				50	Calculating variability or spread
				51	---------------------------------
				52
				53	================== =============================================
				54	Function Description
				55	================== =============================================
				56	pvariance Population variance of data.
				57	variance Sample variance of data.
				58	pstdev Population standard deviation of data.
				59	stdev Sample standard deviation of data.
				60	================== =============================================
				61
				62	Calculate the standard deviation of sample data:
				63
				64	>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
				65	4.38961843444...
				66
				67	If you have previously calculated the mean, you can pass it as the optional
				68	second argument to the four "spread" functions to avoid recalculating it:
				69
				70	>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
				71	>>> mu = mean(data)
				72	>>> pvariance(data, mu)
				73	2.5
				74
				75
				76	Exceptions
				77	----------
				78
				79	A single exception is defined: StatisticsError is a subclass of ValueError.
				80
				81	"""
				82
# Public API of this module: the names exported by a star-import.
# Helpers whose names start with an underscore are deliberately left out.
__all__ = [
    'NormalDist',
    'StatisticsError',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	102
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	103	import math
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	104	import numbers
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	105	import random
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	106
				107	from fractions import Fraction
				108	from decimal import Decimal
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	109	from itertools import groupby, repeat
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	110	from bisect import bisect_left, bisect_right
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	111	from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	112	from operator import itemgetter
				113	from collections import Counter
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	114
				115	# === Exceptions ===
				116
				117	class StatisticsError(ValueError):
				118	pass
				119
				120
				121	# === Private utilities ===
				122
				123	def _sum(data, start=0):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	124	"""_sum(data [, start]) -> (type, sum, count)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	125
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	126	Return a high-precision sum of the given numeric data as a fraction,
				127	together with the type to be converted to and the count of items.
				128
				129	If optional argument ``start`` is given, it is added to the total.
				130	If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	131
				132
				133	Examples
				134	--------
				135
				136	>>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	137	(<class 'float'>, Fraction(11, 1), 5)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	138
				139	Some sources of round-off error will be avoided:
				140
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	141	# Built-in sum returns zero.
				142	>>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	143	(<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	144
				145	Fractions and Decimals are also supported:
				146
				147	>>> from fractions import Fraction as F
				148	>>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	149	(<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	150
				151	>>> from decimal import Decimal as D
				152	>>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
				153	>>> _sum(data)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	154	(<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	155
Nick Coghlan	73afe2a	2014-02-08 19:58:04 +1000	[diff] [blame]	156	Mixed types are currently treated as an error, except that int is
				157	allowed.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	158	"""
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	159	count = 0
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	160	n, d = _exact_ratio(start)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	161	partials = {d: n}
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	162	partials_get = partials.get
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	163	T = _coerce(int, type(start))
				164	for typ, values in groupby(data, type):
				165	T = _coerce(T, typ) # or raise TypeError
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	166	for n, d in map(_exact_ratio, values):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	167	count += 1
				168	partials[d] = partials_get(d, 0) + n
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	169	if None in partials:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	170	# The sum will be a NAN or INF. We can ignore all the finite
				171	# partials, and just look at this special one.
				172	total = partials[None]
				173	assert not _isfinite(total)
				174	else:
				175	# Sum all the partial sums using builtin sum.
				176	# FIXME is this faster if we sum them in order of the denominator?
				177	total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
				178	return (T, total, count)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	179
				180
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	181	def _isfinite(x):
				182	try:
				183	return x.is_finite() # Likely a Decimal.
				184	except AttributeError:
				185	return math.isfinite(x) # Coerces to float first.
				186
				187
				188	def _coerce(T, S):
				189	"""Coerce types T and S to a common type, or raise TypeError.
				190
				191	Coercion rules are currently an implementation detail. See the CoerceTest
				192	test class in test_statistics for details.
				193	"""
				194	# See http://bugs.python.org/issue24068.
				195	assert T is not bool, "initial type T is bool"
				196	# If the types are the same, no need to coerce anything. Put this
				197	# first, so that the usual case (no coercion needed) happens as soon
				198	# as possible.
				199	if T is S: return T
				200	# Mixed int & other coerce to the other type.
				201	if S is int or S is bool: return T
				202	if T is int: return S
				203	# If one is a (strict) subclass of the other, coerce to the subclass.
				204	if issubclass(S, T): return S
				205	if issubclass(T, S): return T
				206	# Ints coerce to the other type.
				207	if issubclass(T, int): return S
				208	if issubclass(S, int): return T
				209	# Mixed fraction & float coerces to float (or float subclass).
				210	if issubclass(T, Fraction) and issubclass(S, float):
				211	return S
				212	if issubclass(T, float) and issubclass(S, Fraction):
				213	return T
				214	# Any other combination is disallowed.
				215	msg = "don't know how to coerce %s and %s"
				216	raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan	73afe2a	2014-02-08 19:58:04 +1000	[diff] [blame]	217
				218
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	219	def _exact_ratio(x):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	220	"""Return Real number x to exact (numerator, denominator) pair.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	221
				222	>>> _exact_ratio(0.25)
				223	(1, 4)
				224
				225	x is expected to be an int, Fraction, Decimal or float.
				226	"""
				227	try:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	228	# Optimise the common case of floats. We expect that the most often
				229	# used numeric type will be builtin floats, so try to make this as
				230	# fast as possible.
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	231	if type(x) is float or type(x) is Decimal:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	232	return x.as_integer_ratio()
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	233	try:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	234	# x may be an int, Fraction, or Integral ABC.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	235	return (x.numerator, x.denominator)
				236	except AttributeError:
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	237	try:
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	238	# x may be a float or Decimal subclass.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	239	return x.as_integer_ratio()
				240	except AttributeError:
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	241	# Just give up?
				242	pass
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	243	except (OverflowError, ValueError):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	244	# float NAN or INF.
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	245	assert not _isfinite(x)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	246	return (x, None)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	247	msg = "can't convert type '{}' to numerator/denominator"
				248	raise TypeError(msg.format(type(x).__name__))
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	249
				250
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	251	def _convert(value, T):
				252	"""Convert value to given numeric type T."""
				253	if type(value) is T:
				254	# This covers the cases where T is Fraction, or where value is
				255	# a NAN or INF (Decimal or float).
				256	return value
				257	if issubclass(T, int) and value.denominator != 1:
				258	T = float
				259	try:
				260	# FIXME: what do we do if this overflows?
				261	return T(value)
				262	except TypeError:
				263	if issubclass(T, Decimal):
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	264	return T(value.numerator) / T(value.denominator)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	265	else:
				266	raise
				267
				268
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	269	def _find_lteq(a, x):
				270	'Locate the leftmost value exactly equal to x'
				271	i = bisect_left(a, x)
				272	if i != len(a) and a[i] == x:
				273	return i
				274	raise ValueError
				275
				276
				277	def _find_rteq(a, l, x):
				278	'Locate the rightmost value exactly equal to x'
				279	i = bisect_right(a, x, lo=l)
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	280	if i != (len(a) + 1) and a[i - 1] == x:
				281	return i - 1
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	282	raise ValueError
				283
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	284
				285	def _fail_neg(values, errmsg='negative value'):
				286	"""Iterate over values, failing if any are less than zero."""
				287	for x in values:
				288	if x < 0:
				289	raise StatisticsError(errmsg)
				290	yield x
				291
				292
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	293	# === Measures of central tendency (averages) ===
				294
				295	def mean(data):
				296	"""Return the sample arithmetic mean of data.
				297
				298	>>> mean([1, 2, 3, 4, 4])
				299	2.8
				300
				301	>>> from fractions import Fraction as F
				302	>>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
				303	Fraction(13, 21)
				304
				305	>>> from decimal import Decimal as D
				306	>>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
				307	Decimal('0.5625')
				308
				309	If ``data`` is empty, StatisticsError will be raised.
				310	"""
				311	if iter(data) is data:
				312	data = list(data)
				313	n = len(data)
				314	if n < 1:
				315	raise StatisticsError('mean requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	316	T, total, count = _sum(data)
				317	assert count == n
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	318	return _convert(total / n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	319
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	320
def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        size = len(data)
    except TypeError:
        # No __len__() (e.g. an iterator): count the items while fsum()
        # consumes them.
        size = 0

        def counted(iterable):
            nonlocal size
            for size, item in enumerate(iterable, start=1):
                yield item

        total = fsum(counted(data))
    else:
        total = fsum(data)
    try:
        return total / size
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	346
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	347
def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        # The geometric mean is the exponential of the mean of the logs.
        mean_of_logs = fmean(map(log, data))
        return exp(mean_of_logs)
    except ValueError:
        raise StatisticsError(
            'geometric mean requires a non-empty dataset '
            ' containing positive numbers'
        ) from None
				365
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	366
def harmonic_mean(data, weights=None):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds.

    Suppose a car travels 40 km/hr for 5 km and then speeds-up to
    60 km/hr for another 5 km. What is the average speed?

        >>> harmonic_mean([40, 60])
        48.0

    Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
    speeds-up to 60 km/hr for the remaining 30 km of the journey. What
    is the average speed?

        >>> harmonic_mean([40, 60], weights=[5, 30])
        56.0

    If ``weights`` is given it must supply one weight per data point,
    otherwise ``StatisticsError`` is raised.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # An iterator has no length, so take a snapshot of it first.
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    if n == 1 and weights is None:
        # A lone unweighted value is its own harmonic mean.
        only = data[0]
        if not isinstance(only, (numbers.Real, Decimal)):
            raise TypeError('unsupported type')
        if only < 0:
            raise StatisticsError(errmsg)
        return only
    if weights is None:
        # Unweighted: every data point has weight 1.
        weights = repeat(1, n)
        sum_weights = n
    else:
        if iter(weights) is weights:
            weights = list(weights)
        if len(weights) != n:
            raise StatisticsError('Number of weights does not match data size')
        _, sum_weights, _ = _sum(_fail_neg(weights, errmsg))
    checked = _fail_neg(data, errmsg)
    try:
        T, total, _ = _sum(
            w / x if w else 0 for w, x in zip(weights, checked)
        )
    except ZeroDivisionError:
        # A zero data value with a non-zero weight gives a result of zero.
        return 0
    if total <= 0:
        raise StatisticsError('Weighted sum must be positive')
    return _convert(sum_weights / total, T)
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	422
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	423	# FIXME: investigate ways to calculate medians without sorting? Quickselect?
				424	def median(data):
				425	"""Return the median (middle value) of numeric data.
				426
				427	When the number of data points is odd, return the middle data point.
				428	When the number of data points is even, the median is interpolated by
				429	taking the average of the two middle values:
				430
				431	>>> median([1, 3, 5])
				432	3
				433	>>> median([1, 3, 5, 7])
				434	4.0
				435
				436	"""
				437	data = sorted(data)
				438	n = len(data)
				439	if n == 0:
				440	raise StatisticsError("no median for empty data")
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	441	if n % 2 == 1:
				442	return data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	443	else:
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	444	i = n // 2
				445	return (data[i - 1] + data[i]) / 2
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	446
				447
				448	def median_low(data):
				449	"""Return the low median of numeric data.
				450
				451	When the number of data points is odd, the middle value is returned.
				452	When it is even, the smaller of the two middle values is returned.
				453
				454	>>> median_low([1, 3, 5])
				455	3
				456	>>> median_low([1, 3, 5, 7])
				457	3
				458
				459	"""
				460	data = sorted(data)
				461	n = len(data)
				462	if n == 0:
				463	raise StatisticsError("no median for empty data")
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	464	if n % 2 == 1:
				465	return data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	466	else:
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	467	return data[n // 2 - 1]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	468
				469
				470	def median_high(data):
				471	"""Return the high median of data.
				472
				473	When the number of data points is odd, the middle value is returned.
				474	When it is even, the larger of the two middle values is returned.
				475
				476	>>> median_high([1, 3, 5])
				477	3
				478	>>> median_high([1, 3, 5, 7])
				479	5
				480
				481	"""
				482	data = sorted(data)
				483	n = len(data)
				484	if n == 0:
				485	raise StatisticsError("no median for empty data")
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	486	return data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	487
				488
				489	def median_grouped(data, interval=1):
Zachary Ware	df2660e	2015-10-27 22:00:41 -0500	[diff] [blame]	490	"""Return the 50th percentile (median) of grouped continuous data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	491
				492	>>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
				493	3.7
				494	>>> median_grouped([52, 52, 53, 54])
				495	52.5
				496
				497	This calculates the median as the 50th percentile, and should be
				498	used when your data is continuous and grouped. In the above example,
				499	the values 1, 2, 3, etc. actually represent the midpoint of classes
				500	0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
				501	class 3.5-4.5, and interpolation is used to estimate it.
				502
				503	Optional argument ``interval`` represents the class interval, and
				504	defaults to 1. Changing the class interval naturally will change the
				505	interpolated 50th percentile value:
				506
				507	>>> median_grouped([1, 3, 3, 5, 7], interval=1)
				508	3.25
				509	>>> median_grouped([1, 3, 3, 5, 7], interval=2)
				510	3.5
				511
				512	This function does not check whether the data points are at least
				513	``interval`` apart.
				514	"""
				515	data = sorted(data)
				516	n = len(data)
				517	if n == 0:
				518	raise StatisticsError("no median for empty data")
				519	elif n == 1:
				520	return data[0]
				521	# Find the value at the midpoint. Remember this corresponds to the
				522	# centre of the class interval.
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	523	x = data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	524	for obj in (x, interval):
				525	if isinstance(obj, (str, bytes)):
				526	raise TypeError('expected number but got %r' % obj)
				527	try:
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	528	L = x - interval / 2 # The lower limit of the median interval.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	529	except TypeError:
				530	# Mixed type. For now we just coerce to float.
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	531	L = float(x) - float(interval) / 2
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	532
				533	# Uses bisection search to search for x in data with log(n) time complexity
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	534	# Find the position of leftmost occurrence of x in data
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	535	l1 = _find_lteq(data, x)
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	536	# Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	537	# Assuming always l1 <= l2
				538	l2 = _find_rteq(data, l1, x)
				539	cf = l1
				540	f = l2 - l1 + 1
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	541	return L + interval * (n / 2 - cf) / f
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	542
				543
				544	def mode(data):
				545	"""Return the most common data point from discrete or nominal data.
				546
				547	``mode`` assumes discrete data, and returns a single value. This is the
				548	standard treatment of the mode as commonly taught in schools:
				549
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	550	>>> mode([1, 1, 2, 3, 3, 3, 3, 4])
				551	3
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	552
				553	This also works with nominal (non-numeric) data:
				554
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	555	>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
				556	'red'
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	557
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	558	If there are multiple modes with same frequency, return the first one
				559	encountered:
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	560
				561	>>> mode(['red', 'red', 'green', 'blue', 'blue'])
				562	'red'
				563
				564	If data is empty, ``mode``, raises StatisticsError.
				565
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	566	"""
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	567	pairs = Counter(iter(data)).most_common(1)
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	568	try:
Raymond Hettinger	7ce4bfa	2019-09-20 21:46:52 -0700	[diff] [blame]	569	return pairs[0][0]
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	570	except IndexError:
				571	raise StatisticsError('no mode for empty data') from None
				572
				573
				574	def multimode(data):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	575	"""Return a list of the most frequently occurring values.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	576
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	577	Will return more than one result if there are multiple modes
				578	or an empty list if data is empty.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	579
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	580	>>> multimode('aabbbbbbbbcc')
				581	['b']
				582	>>> multimode('aabbbbccddddeeffffgg')
				583	['b', 'd', 'f']
				584	>>> multimode('')
				585	[]
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	586	"""
				587	counts = Counter(iter(data)).most_common()
				588	maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
				589	return list(map(itemgetter(0), mode_items))
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	590
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	591
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	592	# Notes on methods for computing quantiles
				593	# ----------------------------------------
				594	#
				595	# There is no one perfect way to compute quantiles. Here we offer
				596	# two methods that serve common needs. Most other packages
				597	# surveyed offered at least one or both of these two, making them
				598	# "standard" in the sense of "widely-adopted and reproducible".
				599	# They are also easy to explain, easy to compute manually, and have
				600	# straight-forward interpretations that aren't surprising.
				601
				602	# The default method is known as "R6", "PERCENTILE.EXC", or "expected
				603	# value of rank order statistics". The alternative method is known as
				604	# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
				605
				606	# For sample data where there is a positive probability for values
				607	# beyond the range of the data, the R6 exclusive method is a
				608	# reasonable choice. Consider a random sample of nine values from a
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	609	# population with a uniform distribution from 0.0 to 1.0. The
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	610	# distribution of the third ranked sample point is described by
				611	# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
				612	# mean=0.300. Only the latter (which corresponds with R6) gives the
				613	# desired cut point with 30% of the population falling below that
				614	# value, making it comparable to a result from an inv_cdf() function.
Raymond Hettinger	7ce4bfa	2019-09-20 21:46:52 -0700	[diff] [blame]	615	# The R6 exclusive method is also idempotent.
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	616
				617	# For describing population data where the end points are known to
				618	# be included in the data, the R7 inclusive method is a reasonable
				619	# choice. Instead of the mean, it uses the mode of the beta
				620	# distribution for the interior points. Per Hyndman & Fan, "One nice
				621	# property is that the vertices of Q7(p) divide the range into n - 1
				622	# intervals, and exactly 100p% of the intervals lie to the left of
				623	# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
				624
Raymond Hettinger	eed5e9a	2019-07-19 01:57:22 -0700	[diff] [blame]	625	# If needed, other methods could be added. However, for now, the
				626	# position is that fewer options make for easier choices and that
				627	# external packages can be used for anything more advanced.
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	628
def quantiles(data, *, n=4, method='exclusive'):
    """Divide data into n continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set n to 4 for quartiles (the default).  Set n to 10 for deciles.
    Set n to 100 for percentiles which gives the 99 cut points that
    separate data into 100 equal sized groups.

    The data can be any iterable containing sample data; it is sorted
    into a new list, so the caller's object is not modified.
    The cut points are linearly interpolated between data points.

    If method is set to 'inclusive', data is treated as population
    data.  The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.  The default,
    'exclusive', lets cut points be extrapolated from the two lowest or
    two highest data points.

    Raises StatisticsError if n is less than 1 or if there are fewer than
    two data points.  Raises ValueError if method is neither 'exclusive'
    nor 'inclusive' (checked only after the data has been validated).
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        m = ld - 1
        result = []
        for i in range(1, n):
            # i*m/n is the fractional index of the cut point; j is its
            # integer part and delta/n its fractional part.
            j, delta = divmod(i * m, n)
            interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
            result.append(interpolated)
        return result
    if method == 'exclusive':
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n                               # rescale i to m/n
            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
            # After clamping, delta may fall outside 0..n, which makes the
            # formula below extrapolate beyond the outermost pair of points.
            delta = i*m - j*n                            # exact integer math
            interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
            result.append(interpolated)
        return result
    raise ValueError(f'Unknown method: {method!r}')
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	670
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	671
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	672	# === Measures of spread ===
				673
				674	# See http://mathworld.wolfram.com/Variance.html
				675	# http://mathworld.wolfram.com/SampleVariance.html
				676	# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
				677	#
				678	# Under no circumstances use the so-called "computational formula for
				679	# variance", as that is only suitable for hand calculations with a small
				680	# amount of low-precision data. It has terrible numeric properties.
				681	#
				682	# See a comparison of three computational methods here:
				683	# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
				684
				685	def _ss(data, c=None):
				686	"""Return sum of square deviations of sequence data.
				687
				688	If ``c`` is None, the mean is calculated in one pass, and the deviations
				689	from the mean are calculated in a second pass. Otherwise, deviations are
				690	calculated from ``c`` as given. Use the second case with care, as it can
				691	lead to garbage results.
				692	"""
Raymond Hettinger	d71ab4f	2020-06-13 15:55:52 -0700	[diff] [blame]	693	if c is not None:
				694	T, total, count = _sum((x-c)**2 for x in data)
				695	return (T, total)
				696	c = mean(data)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	697	T, total, count = _sum((x-c)**2 for x in data)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	698	# The following sum should mathematically equal zero, but due to rounding
				699	# error may not.
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	700	U, total2, count2 = _sum((x - c) for x in data)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	701	assert T == U and count == count2
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	702	total -= total2 ** 2 / len(data)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	703	assert not total < 0, 'negative sum of square deviations: %f' % total
				704	return (T, total)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	705
				706
				707	def variance(data, xbar=None):
				708	"""Return the sample variance of data.
				709
				710	data should be an iterable of Real-valued numbers, with at least two
				711	values. The optional argument xbar, if given, should be the mean of
				712	the data. If it is missing or None, the mean is automatically calculated.
				713
				714	Use this function when your data is a sample from a population. To
				715	calculate the variance from the entire population, see ``pvariance``.
				716
				717	Examples:
				718
				719	>>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
				720	>>> variance(data)
				721	1.3720238095238095
				722
				723	If you have already calculated the mean of your data, you can pass it as
				724	the optional second argument ``xbar`` to avoid recalculating it:
				725
				726	>>> m = mean(data)
				727	>>> variance(data, m)
				728	1.3720238095238095
				729
				730	This function does not check that ``xbar`` is actually the mean of
				731	``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
				732	impossible results.
				733
				734	Decimals and Fractions are supported:
				735
				736	>>> from decimal import Decimal as D
				737	>>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				738	Decimal('31.01875')
				739
				740	>>> from fractions import Fraction as F
				741	>>> variance([F(1, 6), F(1, 2), F(5, 3)])
				742	Fraction(67, 108)
				743
				744	"""
				745	if iter(data) is data:
				746	data = list(data)
				747	n = len(data)
				748	if n < 2:
				749	raise StatisticsError('variance requires at least two data points')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	750	T, ss = _ss(data, xbar)
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	751	return _convert(ss / (n - 1), T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	752
				753
				754	def pvariance(data, mu=None):
				755	"""Return the population variance of ``data``.
				756
Raymond Hettinger	733b9a3	2019-11-11 23:35:06 -0800	[diff] [blame]	757	data should be a sequence or iterable of Real-valued numbers, with at least one
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	758	value. The optional argument mu, if given, should be the mean of
				759	the data. If it is missing or None, the mean is automatically calculated.
				760
				761	Use this function to calculate the variance from the entire population.
				762	To estimate the variance from a sample, the ``variance`` function is
				763	usually a better choice.
				764
				765	Examples:
				766
				767	>>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
				768	>>> pvariance(data)
				769	1.25
				770
				771	If you have already calculated the mean of the data, you can pass it as
				772	the optional second argument to avoid recalculating it:
				773
				774	>>> mu = mean(data)
				775	>>> pvariance(data, mu)
				776	1.25
				777
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	778	Decimals and Fractions are supported:
				779
				780	>>> from decimal import Decimal as D
				781	>>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				782	Decimal('24.815')
				783
				784	>>> from fractions import Fraction as F
				785	>>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
				786	Fraction(13, 72)
				787
				788	"""
				789	if iter(data) is data:
				790	data = list(data)
				791	n = len(data)
				792	if n < 1:
				793	raise StatisticsError('pvariance requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	794	T, ss = _ss(data, mu)
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	795	return _convert(ss / n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	796
				797
				798	def stdev(data, xbar=None):
				799	"""Return the square root of the sample variance.
				800
				801	See ``variance`` for arguments and other details.
				802
				803	>>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				804	1.0810874155219827
				805
				806	"""
				807	var = variance(data, xbar)
				808	try:
				809	return var.sqrt()
				810	except AttributeError:
				811	return math.sqrt(var)
				812
				813
				814	def pstdev(data, mu=None):
				815	"""Return the square root of the population variance.
				816
				817	See ``pvariance`` for arguments and other details.
				818
				819	>>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				820	0.986893273527251
				821
				822	"""
				823	var = pvariance(data, mu)
				824	try:
				825	return var.sqrt()
				826	except AttributeError:
				827	return math.sqrt(var)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	828
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	829
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	830	## Normal Distribution #####################################################
				831
Dong-hee Na	0a18ee4	2019-08-24 07:20:30 +0900	[diff] [blame]	832
				833	def _normal_dist_inv_cdf(p, mu, sigma):
				834	# There is no closed-form solution to the inverse CDF for the normal
				835	# distribution, so we use a rational approximation instead:
				836	# Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
				837	# Normal Distribution". Applied Statistics. Blackwell Publishing. 37
				838	# (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
				839	q = p - 0.5
				840	if fabs(q) <= 0.425:
				841	r = 0.180625 - q * q
				842	# Hash sum: 55.88319_28806_14901_4439
				843	num = (((((((2.50908_09287_30122_6727e+3 * r +
				844	3.34305_75583_58812_8105e+4) * r +
				845	6.72657_70927_00870_0853e+4) * r +
				846	4.59219_53931_54987_1457e+4) * r +
				847	1.37316_93765_50946_1125e+4) * r +
				848	1.97159_09503_06551_4427e+3) * r +
				849	1.33141_66789_17843_7745e+2) * r +
				850	3.38713_28727_96366_6080e+0) * q
				851	den = (((((((5.22649_52788_52854_5610e+3 * r +
				852	2.87290_85735_72194_2674e+4) * r +
				853	3.93078_95800_09271_0610e+4) * r +
				854	2.12137_94301_58659_5867e+4) * r +
				855	5.39419_60214_24751_1077e+3) * r +
				856	6.87187_00749_20579_0830e+2) * r +
				857	4.23133_30701_60091_1252e+1) * r +
				858	1.0)
				859	x = num / den
				860	return mu + (x * sigma)
				861	r = p if q <= 0.0 else 1.0 - p
				862	r = sqrt(-log(r))
				863	if r <= 5.0:
				864	r = r - 1.6
				865	# Hash sum: 49.33206_50330_16102_89036
				866	num = (((((((7.74545_01427_83414_07640e-4 * r +
				867	2.27238_44989_26918_45833e-2) * r +
				868	2.41780_72517_74506_11770e-1) * r +
				869	1.27045_82524_52368_38258e+0) * r +
				870	3.64784_83247_63204_60504e+0) * r +
				871	5.76949_72214_60691_40550e+0) * r +
				872	4.63033_78461_56545_29590e+0) * r +
				873	1.42343_71107_49683_57734e+0)
				874	den = (((((((1.05075_00716_44416_84324e-9 * r +
				875	5.47593_80849_95344_94600e-4) * r +
				876	1.51986_66563_61645_71966e-2) * r +
				877	1.48103_97642_74800_74590e-1) * r +
				878	6.89767_33498_51000_04550e-1) * r +
				879	1.67638_48301_83803_84940e+0) * r +
				880	2.05319_16266_37758_82187e+0) * r +
				881	1.0)
				882	else:
				883	r = r - 5.0
				884	# Hash sum: 47.52583_31754_92896_71629
				885	num = (((((((2.01033_43992_92288_13265e-7 * r +
				886	2.71155_55687_43487_57815e-5) * r +
				887	1.24266_09473_88078_43860e-3) * r +
				888	2.65321_89526_57612_30930e-2) * r +
				889	2.96560_57182_85048_91230e-1) * r +
				890	1.78482_65399_17291_33580e+0) * r +
				891	5.46378_49111_64114_36990e+0) * r +
				892	6.65790_46435_01103_77720e+0)
				893	den = (((((((2.04426_31033_89939_78564e-15 * r +
				894	1.42151_17583_16445_88870e-7) * r +
				895	1.84631_83175_10054_68180e-5) * r +
				896	7.86869_13114_56132_59100e-4) * r +
				897	1.48753_61290_85061_48525e-2) * r +
				898	1.36929_88092_27358_05310e-1) * r +
				899	5.99832_20655_58879_37690e-1) * r +
				900	1.0)
				901	x = num / den
				902	if q < 0.0:
				903	x = -x
				904	return mu + (x * sigma)
				905
				906
Raymond Hettinger	0400a7f	2020-05-02 19:30:24 -0700	[diff] [blame]	907	# If available, use C implementation
				908	try:
				909	from _statistics import _normal_dist_inv_cdf
				910	except ImportError:
				911	pass
				912
				913
class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    # The dict form of __slots__ attaches a description to each slot.
    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        """NormalDist where mu is the mean and sigma is the standard deviation.

        Both values are stored as floats.  Raises StatisticsError if sigma
        is negative.
        """
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        # fmean() and stdev() each need to traverse the data, so one-shot
        # iterables are materialized first.
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, *, seed=None):
        """Generate n samples for a given mean and standard deviation.

        If seed is given, a private random.Random(seed) generator is used
        so that the result is reproducible; otherwise the shared module
        level generator is used.
        """
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self._mu, self._sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        """Probability density function.  P(x <= X < x+dx) / dx

        Raises StatisticsError when sigma is zero.
        """
        variance = self._sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)

    def cdf(self, x):
        """Cumulative distribution function.  P(X <= x)

        Raises StatisticsError when sigma is zero.
        """
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.

        Raises StatisticsError unless 0.0 < p < 1.0 and sigma is positive.
        """
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self._sigma <= 0.0:
            raise StatisticsError('inv_cdf() not defined when sigma at or below zero')
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)

    def quantiles(self, n=4):
        """Divide into n continuous intervals with equal probability.

        Returns a list of (n - 1) cut points separating the intervals.

        Set n to 4 for quartiles (the default).  Set n to 10 for deciles.
        Set n to 100 for percentiles which gives the 99 cut points that
        separate the normal distribution into 100 equal sized groups.
        """
        return [self.inv_cdf(i / n) for i in range(1, n)]

    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

        Raises TypeError if other is not a NormalDist, and StatisticsError
        if either distribution has a sigma of zero.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):   # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            # Equal variances: the two densities cross at a single point.
            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
        # Unequal variances: the densities cross at two points, x1 and x2.
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    def zscore(self, x):
        """Compute the Standard Score.  (x - mean) / stdev

        Describes x in terms of the number of standard deviations
        above or below the mean of the normal distribution.

        Raises StatisticsError when sigma is zero.
        """
        # https://www.statisticshowto.com/probability-and-statistics/z-score/
        if not self._sigma:
            raise StatisticsError('zscore() not defined when sigma is zero')
        return (x - self._mu) / self._sigma

    @property
    def mean(self):
        "Arithmetic mean of the normal distribution."
        return self._mu

    @property
    def median(self):
        "Return the median of the normal distribution"
        return self._mu

    @property
    def mode(self):
        """Return the mode of the normal distribution

        The mode is the value x at which the probability density
        function (pdf) takes its maximum value.
        """
        return self._mu

    @property
    def stdev(self):
        "Standard deviation of the normal distribution."
        return self._sigma

    @property
    def variance(self):
        "Square of the standard deviation."
        return self._sigma ** 2.0

    def __add__(x1, x2):
        """Add a constant or another NormalDist instance.

        If other is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If other is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            # hypot() gives sqrt(s1**2 + s2**2), i.e. the variances add.
            return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu + x2, x1._sigma)

    def __sub__(x1, x2):
        """Subtract a constant or another NormalDist instance.

        If other is a constant, translate by the constant mu,
        leaving sigma unchanged.

        If other is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu - x2, x1._sigma)

    def __mul__(x1, x2):
        """Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))

    def __truediv__(x1, x2):
        """Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))

    def __pos__(x1):
        "Return a copy of the instance."
        return NormalDist(x1._mu, x1._sigma)

    def __neg__(x1):
        "Negates mu while keeping sigma the same."
        return NormalDist(-x1._mu, x1._sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        "Subtract a NormalDist from a constant or another NormalDist."
        # x2 - x1 is computed as -(x1 - x2).
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        "Two NormalDist objects are equal if their mu and sigma are both equal."
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return x1._mu == x2._mu and x1._sigma == x2._sigma

    def __hash__(self):
        "NormalDist objects hash equal if their mu and sigma are both equal."
        return hash((self._mu, self._sigma))

    def __repr__(self):
        return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'