Blame - Lib/statistics.py - platform/external/python/cpython3

blob: 673a162b3a79fc047c8423642006d705e06b0099 [file] [log] [blame]

Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	1	"""
				2	Basic statistics module.
				3
				4	This module provides functions for calculating statistics of data, including
				5	averages, variance, and standard deviation.
				6
				7	Calculating averages
				8	--------------------
				9
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	10	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	11	Function Description
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	12	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	13	mean Arithmetic mean (average) of data.
Raymond Hettinger	7280048	2019-04-23 01:35:16 -0700	[diff] [blame]	14	fmean Fast, floating point arithmetic mean.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	15	geometric_mean Geometric mean of data.
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	16	harmonic_mean Harmonic mean of data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	17	median Median (middle value) of data.
				18	median_low Low median of data.
				19	median_high High median of data.
				20	median_grouped Median, or 50th percentile, of grouped data.
				21	mode Mode (most common value) of data.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	22	multimode List of modes (most common values of data).
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	23	quantiles Divide data into intervals with equal probability.
				24	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	25
				26	Calculate the arithmetic mean ("the average") of data:
				27
				28	>>> mean([-1.0, 2.5, 3.25, 5.75])
				29	2.625
				30
				31
				32	Calculate the standard median of discrete data:
				33
				34	>>> median([2, 3, 4, 5])
				35	3.5
				36
				37
				38	Calculate the median, or 50th percentile, of data grouped into class intervals
				39	centred on the data values provided. E.g. if your data points are rounded to
				40	the nearest whole number:
				41
				42	>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
				43	2.8333333333...
				44
				45	This should be interpreted in this way: you have two data points in the class
				46	interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
				47	the class interval 3.5-4.5. The median of these data points is 2.8333...
				48
				49
				50	Calculating variability or spread
				51	---------------------------------
				52
				53	================== =============================================
				54	Function Description
				55	================== =============================================
				56	pvariance Population variance of data.
				57	variance Sample variance of data.
				58	pstdev Population standard deviation of data.
				59	stdev Sample standard deviation of data.
				60	================== =============================================
				61
				62	Calculate the standard deviation of sample data:
				63
				64	>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
				65	4.38961843444...
				66
				67	If you have previously calculated the mean, you can pass it as the optional
				68	second argument to the four "spread" functions to avoid recalculating it:
				69
				70	>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
				71	>>> mu = mean(data)
				72	>>> pvariance(data, mu)
				73	2.5
				74
				75
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	76	Statistics for relations between two inputs
				77	-------------------------------------------
				78
				79	================== ====================================================
				80	Function Description
				81	================== ====================================================
				82	covariance Sample covariance for two variables.
				83	correlation Pearson's correlation coefficient for two variables.
				84	linear_regression Intercept and slope for simple linear regression.
				85	================== ====================================================
				86
				87	Calculate covariance, Pearson's correlation, and simple linear regression
				88	for two inputs:
				89
				90	>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
				91	>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
				92	>>> covariance(x, y)
				93	0.75
				94	>>> correlation(x, y) #doctest: +ELLIPSIS
				95	0.31622776601...
				96	>>> linear_regression(x, y) #doctest:
				97	LinearRegression(intercept=1.5, slope=0.1)
				98
				99
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	100	Exceptions
				101	----------
				102
				103	A single exception is defined: StatisticsError is a subclass of ValueError.
				104
				105	"""
				106
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	107	__all__ = [
				108	'NormalDist',
				109	'StatisticsError',
				110	'fmean',
				111	'geometric_mean',
				112	'harmonic_mean',
				113	'mean',
				114	'median',
				115	'median_grouped',
				116	'median_high',
				117	'median_low',
				118	'mode',
				119	'multimode',
				120	'pstdev',
				121	'pvariance',
				122	'quantiles',
				123	'stdev',
				124	'variance',
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	125	'correlation',
				126	'covariance',
				127	'linear_regression',
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	128	]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	129
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	130	import math
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	131	import numbers
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	132	import random
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	133
				134	from fractions import Fraction
				135	from decimal import Decimal
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	136	from itertools import groupby, repeat
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	137	from bisect import bisect_left, bisect_right
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	138	from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	139	from operator import itemgetter
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	140	from collections import Counter, namedtuple
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	141
				142	# === Exceptions ===
				143
				144	class StatisticsError(ValueError):
				145	pass
				146
				147
				148	# === Private utilities ===
				149
				150	def _sum(data, start=0):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	151	"""_sum(data [, start]) -> (type, sum, count)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	152
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	153	Return a high-precision sum of the given numeric data as a fraction,
				154	together with the type to be converted to and the count of items.
				155
				156	If optional argument ``start`` is given, it is added to the total.
				157	If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	158
				159
				160	Examples
				161	--------
				162
				163	>>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	164	(<class 'float'>, Fraction(11, 1), 5)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	165
				166	Some sources of round-off error will be avoided:
				167
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	168	# Built-in sum returns zero.
				169	>>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	170	(<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	171
				172	Fractions and Decimals are also supported:
				173
				174	>>> from fractions import Fraction as F
				175	>>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	176	(<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	177
				178	>>> from decimal import Decimal as D
				179	>>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
				180	>>> _sum(data)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	181	(<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	182
Nick Coghlan	73afe2a	2014-02-08 19:58:04 +1000	[diff] [blame]	183	Mixed types are currently treated as an error, except that int is
				184	allowed.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	185	"""
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	186	count = 0
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	187	n, d = _exact_ratio(start)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	188	partials = {d: n}
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	189	partials_get = partials.get
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	190	T = _coerce(int, type(start))
				191	for typ, values in groupby(data, type):
				192	T = _coerce(T, typ) # or raise TypeError
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	193	for n, d in map(_exact_ratio, values):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	194	count += 1
				195	partials[d] = partials_get(d, 0) + n
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	196	if None in partials:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	197	# The sum will be a NAN or INF. We can ignore all the finite
				198	# partials, and just look at this special one.
				199	total = partials[None]
				200	assert not _isfinite(total)
				201	else:
				202	# Sum all the partial sums using builtin sum.
				203	# FIXME is this faster if we sum them in order of the denominator?
				204	total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
				205	return (T, total, count)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	206
				207
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	208	def _isfinite(x):
				209	try:
				210	return x.is_finite() # Likely a Decimal.
				211	except AttributeError:
				212	return math.isfinite(x) # Coerces to float first.
				213
				214
				215	def _coerce(T, S):
				216	"""Coerce types T and S to a common type, or raise TypeError.
				217
				218	Coercion rules are currently an implementation detail. See the CoerceTest
				219	test class in test_statistics for details.
				220	"""
				221	# See http://bugs.python.org/issue24068.
				222	assert T is not bool, "initial type T is bool"
				223	# If the types are the same, no need to coerce anything. Put this
				224	# first, so that the usual case (no coercion needed) happens as soon
				225	# as possible.
				226	if T is S: return T
				227	# Mixed int & other coerce to the other type.
				228	if S is int or S is bool: return T
				229	if T is int: return S
				230	# If one is a (strict) subclass of the other, coerce to the subclass.
				231	if issubclass(S, T): return S
				232	if issubclass(T, S): return T
				233	# Ints coerce to the other type.
				234	if issubclass(T, int): return S
				235	if issubclass(S, int): return T
				236	# Mixed fraction & float coerces to float (or float subclass).
				237	if issubclass(T, Fraction) and issubclass(S, float):
				238	return S
				239	if issubclass(T, float) and issubclass(S, Fraction):
				240	return T
				241	# Any other combination is disallowed.
				242	msg = "don't know how to coerce %s and %s"
				243	raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan	73afe2a	2014-02-08 19:58:04 +1000	[diff] [blame]	244
				245
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	246	def _exact_ratio(x):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	247	"""Return Real number x to exact (numerator, denominator) pair.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	248
				249	>>> _exact_ratio(0.25)
				250	(1, 4)
				251
				252	x is expected to be an int, Fraction, Decimal or float.
				253	"""
				254	try:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	255	# Optimise the common case of floats. We expect that the most often
				256	# used numeric type will be builtin floats, so try to make this as
				257	# fast as possible.
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	258	if type(x) is float or type(x) is Decimal:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	259	return x.as_integer_ratio()
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	260	try:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	261	# x may be an int, Fraction, or Integral ABC.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	262	return (x.numerator, x.denominator)
				263	except AttributeError:
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	264	try:
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	265	# x may be a float or Decimal subclass.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	266	return x.as_integer_ratio()
				267	except AttributeError:
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	268	# Just give up?
				269	pass
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	270	except (OverflowError, ValueError):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	271	# float NAN or INF.
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	272	assert not _isfinite(x)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	273	return (x, None)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	274	msg = "can't convert type '{}' to numerator/denominator"
				275	raise TypeError(msg.format(type(x).__name__))
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	276
				277
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	278	def _convert(value, T):
				279	"""Convert value to given numeric type T."""
				280	if type(value) is T:
				281	# This covers the cases where T is Fraction, or where value is
				282	# a NAN or INF (Decimal or float).
				283	return value
				284	if issubclass(T, int) and value.denominator != 1:
				285	T = float
				286	try:
				287	# FIXME: what do we do if this overflows?
				288	return T(value)
				289	except TypeError:
				290	if issubclass(T, Decimal):
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	291	return T(value.numerator) / T(value.denominator)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	292	else:
				293	raise
				294
				295
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	296	def _find_lteq(a, x):
				297	'Locate the leftmost value exactly equal to x'
				298	i = bisect_left(a, x)
				299	if i != len(a) and a[i] == x:
				300	return i
				301	raise ValueError
				302
				303
				304	def _find_rteq(a, l, x):
				305	'Locate the rightmost value exactly equal to x'
				306	i = bisect_right(a, x, lo=l)
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	307	if i != (len(a) + 1) and a[i - 1] == x:
				308	return i - 1
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	309	raise ValueError
				310
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	311
				312	def _fail_neg(values, errmsg='negative value'):
				313	"""Iterate over values, failing if any are less than zero."""
				314	for x in values:
				315	if x < 0:
				316	raise StatisticsError(errmsg)
				317	yield x
				318
				319
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	320	# === Measures of central tendency (averages) ===
				321
				322	def mean(data):
				323	"""Return the sample arithmetic mean of data.
				324
				325	>>> mean([1, 2, 3, 4, 4])
				326	2.8
				327
				328	>>> from fractions import Fraction as F
				329	>>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
				330	Fraction(13, 21)
				331
				332	>>> from decimal import Decimal as D
				333	>>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
				334	Decimal('0.5625')
				335
				336	If ``data`` is empty, StatisticsError will be raised.
				337	"""
				338	if iter(data) is data:
				339	data = list(data)
				340	n = len(data)
				341	if n < 1:
				342	raise StatisticsError('mean requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	343	T, total, count = _sum(data)
				344	assert count == n
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	345	return _convert(total / n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	346
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	347
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	348	def fmean(data):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	349	"""Convert data to floats and compute the arithmetic mean.
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	350
				351	This runs faster than the mean() function and it always returns a float.
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	352	If the input dataset is empty, it raises a StatisticsError.
				353
				354	>>> fmean([3.5, 4.0, 5.25])
				355	4.25
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	356	"""
				357	try:
				358	n = len(data)
				359	except TypeError:
				360	# Handle iterators that do not define __len__().
				361	n = 0
Raymond Hettinger	6c01ebc	2019-06-05 07:39:38 -0700	[diff] [blame]	362	def count(iterable):
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	363	nonlocal n
Raymond Hettinger	6c01ebc	2019-06-05 07:39:38 -0700	[diff] [blame]	364	for n, x in enumerate(iterable, start=1):
				365	yield x
				366	total = fsum(count(data))
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	367	else:
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	368	total = fsum(data)
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	369	try:
				370	return total / n
				371	except ZeroDivisionError:
				372	raise StatisticsError('fmean requires at least one data point') from None
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	373
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	374
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	375	def geometric_mean(data):
				376	"""Convert data to floats and compute the geometric mean.
				377
				378	Raises a StatisticsError if the input dataset is empty,
				379	if it contains a zero, or if it contains a negative value.
				380
				381	No special efforts are made to achieve exact results.
				382	(However, this may change in the future.)
				383
				384	>>> round(geometric_mean([54, 24, 36]), 9)
				385	36.0
				386	"""
				387	try:
				388	return exp(fmean(map(log, data)))
				389	except ValueError:
				390	raise StatisticsError('geometric mean requires a non-empty dataset '
				391	' containing positive numbers') from None
				392
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	393
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	394	def harmonic_mean(data, weights=None):
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	395	"""Return the harmonic mean of data.
				396
Raymond Hettinger	30a8b28	2021-02-07 16:44:42 -0800	[diff] [blame]	397	The harmonic mean is the reciprocal of the arithmetic mean of the
				398	reciprocals of the data. It can be used for averaging ratios or
				399	rates, for example speeds.
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	400
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	401	Suppose a car travels 40 km/hr for 5 km and then speeds-up to
				402	60 km/hr for another 5 km. What is the average speed?
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	403
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	404	>>> harmonic_mean([40, 60])
				405	48.0
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	406
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	407	Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
				408	speeds-up to 60 km/hr for the remaining 30 km of the journey. What
				409	is the average speed?
				410
				411	>>> harmonic_mean([40, 60], weights=[5, 30])
				412	56.0
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	413
				414	If ``data`` is empty, or any element is less than zero,
				415	``harmonic_mean`` will raise ``StatisticsError``.
				416	"""
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	417	if iter(data) is data:
				418	data = list(data)
				419	errmsg = 'harmonic mean does not support negative values'
				420	n = len(data)
				421	if n < 1:
				422	raise StatisticsError('harmonic_mean requires at least one data point')
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	423	elif n == 1 and weights is None:
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	424	x = data[0]
				425	if isinstance(x, (numbers.Real, Decimal)):
				426	if x < 0:
				427	raise StatisticsError(errmsg)
				428	return x
				429	else:
				430	raise TypeError('unsupported type')
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	431	if weights is None:
				432	weights = repeat(1, n)
				433	sum_weights = n
				434	else:
				435	if iter(weights) is weights:
				436	weights = list(weights)
				437	if len(weights) != n:
				438	raise StatisticsError('Number of weights does not match data size')
				439	_, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	440	try:
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	441	data = _fail_neg(data, errmsg)
				442	T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	443	except ZeroDivisionError:
				444	return 0
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	445	if total <= 0:
				446	raise StatisticsError('Weighted sum must be positive')
				447	return _convert(sum_weights / total, T)
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	448
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	449	# FIXME: investigate ways to calculate medians without sorting? Quickselect?
				450	def median(data):
				451	"""Return the median (middle value) of numeric data.
				452
				453	When the number of data points is odd, return the middle data point.
				454	When the number of data points is even, the median is interpolated by
				455	taking the average of the two middle values:
				456
				457	>>> median([1, 3, 5])
				458	3
				459	>>> median([1, 3, 5, 7])
				460	4.0
				461
				462	"""
				463	data = sorted(data)
				464	n = len(data)
				465	if n == 0:
				466	raise StatisticsError("no median for empty data")
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	467	if n % 2 == 1:
				468	return data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	469	else:
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	470	i = n // 2
				471	return (data[i - 1] + data[i]) / 2
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	472
				473
				474	def median_low(data):
				475	"""Return the low median of numeric data.
				476
				477	When the number of data points is odd, the middle value is returned.
				478	When it is even, the smaller of the two middle values is returned.
				479
				480	>>> median_low([1, 3, 5])
				481	3
				482	>>> median_low([1, 3, 5, 7])
				483	3
				484
				485	"""
				486	data = sorted(data)
				487	n = len(data)
				488	if n == 0:
				489	raise StatisticsError("no median for empty data")
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	490	if n % 2 == 1:
				491	return data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	492	else:
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	493	return data[n // 2 - 1]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	494
				495
				496	def median_high(data):
				497	"""Return the high median of data.
				498
				499	When the number of data points is odd, the middle value is returned.
				500	When it is even, the larger of the two middle values is returned.
				501
				502	>>> median_high([1, 3, 5])
				503	3
				504	>>> median_high([1, 3, 5, 7])
				505	5
				506
				507	"""
				508	data = sorted(data)
				509	n = len(data)
				510	if n == 0:
				511	raise StatisticsError("no median for empty data")
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	512	return data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	513
				514
				515	def median_grouped(data, interval=1):
Zachary Ware	df2660e	2015-10-27 22:00:41 -0500	[diff] [blame]	516	"""Return the 50th percentile (median) of grouped continuous data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	517
				518	>>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
				519	3.7
				520	>>> median_grouped([52, 52, 53, 54])
				521	52.5
				522
				523	This calculates the median as the 50th percentile, and should be
				524	used when your data is continuous and grouped. In the above example,
				525	the values 1, 2, 3, etc. actually represent the midpoint of classes
				526	0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
				527	class 3.5-4.5, and interpolation is used to estimate it.
				528
				529	Optional argument ``interval`` represents the class interval, and
				530	defaults to 1. Changing the class interval naturally will change the
				531	interpolated 50th percentile value:
				532
				533	>>> median_grouped([1, 3, 3, 5, 7], interval=1)
				534	3.25
				535	>>> median_grouped([1, 3, 3, 5, 7], interval=2)
				536	3.5
				537
				538	This function does not check whether the data points are at least
				539	``interval`` apart.
				540	"""
				541	data = sorted(data)
				542	n = len(data)
				543	if n == 0:
				544	raise StatisticsError("no median for empty data")
				545	elif n == 1:
				546	return data[0]
				547	# Find the value at the midpoint. Remember this corresponds to the
				548	# centre of the class interval.
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	549	x = data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	550	for obj in (x, interval):
				551	if isinstance(obj, (str, bytes)):
				552	raise TypeError('expected number but got %r' % obj)
				553	try:
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	554	L = x - interval / 2 # The lower limit of the median interval.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	555	except TypeError:
				556	# Mixed type. For now we just coerce to float.
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	557	L = float(x) - float(interval) / 2
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	558
				559	# Uses bisection search to search for x in data with log(n) time complexity
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	560	# Find the position of leftmost occurrence of x in data
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	561	l1 = _find_lteq(data, x)
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	562	# Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	563	# Assuming always l1 <= l2
				564	l2 = _find_rteq(data, l1, x)
				565	cf = l1
				566	f = l2 - l1 + 1
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	567	return L + interval * (n / 2 - cf) / f
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	568
				569
				570	def mode(data):
				571	"""Return the most common data point from discrete or nominal data.
				572
				573	``mode`` assumes discrete data, and returns a single value. This is the
				574	standard treatment of the mode as commonly taught in schools:
				575
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	576	>>> mode([1, 1, 2, 3, 3, 3, 3, 4])
				577	3
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	578
				579	This also works with nominal (non-numeric) data:
				580
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	581	>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
				582	'red'
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	583
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	584	If there are multiple modes with same frequency, return the first one
				585	encountered:
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	586
				587	>>> mode(['red', 'red', 'green', 'blue', 'blue'])
				588	'red'
				589
				590	If data is empty, ``mode``, raises StatisticsError.
				591
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	592	"""
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	593	pairs = Counter(iter(data)).most_common(1)
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	594	try:
Raymond Hettinger	7ce4bfa	2019-09-20 21:46:52 -0700	[diff] [blame]	595	return pairs[0][0]
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	596	except IndexError:
				597	raise StatisticsError('no mode for empty data') from None
				598
				599
				600	def multimode(data):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	601	"""Return a list of the most frequently occurring values.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	602
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	603	Will return more than one result if there are multiple modes
				604	or an empty list if data is empty.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	605
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	606	>>> multimode('aabbbbbbbbcc')
				607	['b']
				608	>>> multimode('aabbbbccddddeeffffgg')
				609	['b', 'd', 'f']
				610	>>> multimode('')
				611	[]
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	612	"""
				613	counts = Counter(iter(data)).most_common()
				614	maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
				615	return list(map(itemgetter(0), mode_items))
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	616
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	617
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	618	# Notes on methods for computing quantiles
				619	# ----------------------------------------
				620	#
				621	# There is no one perfect way to compute quantiles. Here we offer
				622	# two methods that serve common needs. Most other packages
				623	# surveyed offered at least one or both of these two, making them
				624	# "standard" in the sense of "widely-adopted and reproducible".
				625	# They are also easy to explain, easy to compute manually, and have
				626	# straight-forward interpretations that aren't surprising.
				627
				628	# The default method is known as "R6", "PERCENTILE.EXC", or "expected
				629	# value of rank order statistics". The alternative method is known as
				630	# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
				631
				632	# For sample data where there is a positive probability for values
				633	# beyond the range of the data, the R6 exclusive method is a
				634	# reasonable choice. Consider a random sample of nine values from a
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	635	# population with a uniform distribution from 0.0 to 1.0. The
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	636	# distribution of the third ranked sample point is described by
				637	# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
				638	# mean=0.300. Only the latter (which corresponds with R6) gives the
				639	# desired cut point with 30% of the population falling below that
				640	# value, making it comparable to a result from an inv_cdf() function.
Raymond Hettinger	7ce4bfa	2019-09-20 21:46:52 -0700	[diff] [blame]	641	# The R6 exclusive method is also idempotent.
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	642
				643	# For describing population data where the end points are known to
				644	# be included in the data, the R7 inclusive method is a reasonable
				645	# choice. Instead of the mean, it uses the mode of the beta
				646	# distribution for the interior points. Per Hyndman & Fan, "One nice
				647	# property is that the vertices of Q7(p) divide the range into n - 1
				648	# intervals, and exactly 100p% of the intervals lie to the left of
				649	# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
				650
Raymond Hettinger	eed5e9a	2019-07-19 01:57:22 -0700	[diff] [blame]	651	# If needed, other methods could be added. However, for now, the
				652	# position is that fewer options make for easier choices and that
				653	# external packages can be used for anything more advanced.
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	654
Raymond Hettinger	272d0d0	2019-09-17 20:45:05 -0700	[diff] [blame]	655	def quantiles(data, *, n=4, method='exclusive'):
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	656	"""Divide data into n continuous intervals with equal probability.
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	657
				658	Returns a list of (n - 1) cut points separating the intervals.
				659
				660	Set n to 4 for quartiles (the default). Set n to 10 for deciles.
				661	Set n to 100 for percentiles which gives the 99 cuts points that
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	662	separate data in to 100 equal sized groups.
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	663
Raymond Hettinger	4db25d5	2019-09-08 16:57:58 -0700	[diff] [blame]	664	The data can be any iterable containing sample.
				665	The cut points are linearly interpolated between data points.
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	666
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	667	If method is set to inclusive, data is treated as population
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	668	data. The minimum value is treated as the 0th percentile and the
				669	maximum value is treated as the 100th percentile.
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	670	"""
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	671	if n < 1:
				672	raise StatisticsError('n must be at least 1')
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	673	data = sorted(data)
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	674	ld = len(data)
				675	if ld < 2:
				676	raise StatisticsError('must have at least two data points')
				677	if method == 'inclusive':
				678	m = ld - 1
				679	result = []
				680	for i in range(1, n):
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	681	j, delta = divmod(i * m, n)
				682	interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	683	result.append(interpolated)
				684	return result
				685	if method == 'exclusive':
				686	m = ld + 1
				687	result = []
				688	for i in range(1, n):
				689	j = i * m // n # rescale i to m/n
				690	j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1
				691	delta = im - jn # exact integer math
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	692	interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	693	result.append(interpolated)
				694	return result
				695	raise ValueError(f'Unknown method: {method!r}')
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	696
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	697
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	698	# === Measures of spread ===
				699
				700	# See http://mathworld.wolfram.com/Variance.html
				701	# http://mathworld.wolfram.com/SampleVariance.html
				702	# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
				703	#
				704	# Under no circumstances use the so-called "computational formula for
				705	# variance", as that is only suitable for hand calculations with a small
				706	# amount of low-precision data. It has terrible numeric properties.
				707	#
				708	# See a comparison of three computational methods here:
				709	# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
				710
				711	def _ss(data, c=None):
				712	"""Return sum of square deviations of sequence data.
				713
				714	If ``c`` is None, the mean is calculated in one pass, and the deviations
				715	from the mean are calculated in a second pass. Otherwise, deviations are
				716	calculated from ``c`` as given. Use the second case with care, as it can
				717	lead to garbage results.
				718	"""
Raymond Hettinger	d71ab4f	2020-06-13 15:55:52 -0700	[diff] [blame]	719	if c is not None:
				720	T, total, count = _sum((x-c)**2 for x in data)
				721	return (T, total)
				722	c = mean(data)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	723	T, total, count = _sum((x-c)**2 for x in data)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	724	# The following sum should mathematically equal zero, but due to rounding
				725	# error may not.
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	726	U, total2, count2 = _sum((x - c) for x in data)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	727	assert T == U and count == count2
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	728	total -= total2 ** 2 / len(data)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	729	assert not total < 0, 'negative sum of square deviations: %f' % total
				730	return (T, total)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	731
				732
				733	def variance(data, xbar=None):
				734	"""Return the sample variance of data.
				735
				736	data should be an iterable of Real-valued numbers, with at least two
				737	values. The optional argument xbar, if given, should be the mean of
				738	the data. If it is missing or None, the mean is automatically calculated.
				739
				740	Use this function when your data is a sample from a population. To
				741	calculate the variance from the entire population, see ``pvariance``.
				742
				743	Examples:
				744
				745	>>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
				746	>>> variance(data)
				747	1.3720238095238095
				748
				749	If you have already calculated the mean of your data, you can pass it as
				750	the optional second argument ``xbar`` to avoid recalculating it:
				751
				752	>>> m = mean(data)
				753	>>> variance(data, m)
				754	1.3720238095238095
				755
				756	This function does not check that ``xbar`` is actually the mean of
				757	``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
				758	impossible results.
				759
				760	Decimals and Fractions are supported:
				761
				762	>>> from decimal import Decimal as D
				763	>>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				764	Decimal('31.01875')
				765
				766	>>> from fractions import Fraction as F
				767	>>> variance([F(1, 6), F(1, 2), F(5, 3)])
				768	Fraction(67, 108)
				769
				770	"""
				771	if iter(data) is data:
				772	data = list(data)
				773	n = len(data)
				774	if n < 2:
				775	raise StatisticsError('variance requires at least two data points')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	776	T, ss = _ss(data, xbar)
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	777	return _convert(ss / (n - 1), T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	778
				779
				780	def pvariance(data, mu=None):
				781	"""Return the population variance of ``data``.
				782
Raymond Hettinger	733b9a3	2019-11-11 23:35:06 -0800	[diff] [blame]	783	data should be a sequence or iterable of Real-valued numbers, with at least one
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	784	value. The optional argument mu, if given, should be the mean of
				785	the data. If it is missing or None, the mean is automatically calculated.
				786
				787	Use this function to calculate the variance from the entire population.
				788	To estimate the variance from a sample, the ``variance`` function is
				789	usually a better choice.
				790
				791	Examples:
				792
				793	>>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
				794	>>> pvariance(data)
				795	1.25
				796
				797	If you have already calculated the mean of the data, you can pass it as
				798	the optional second argument to avoid recalculating it:
				799
				800	>>> mu = mean(data)
				801	>>> pvariance(data, mu)
				802	1.25
				803
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	804	Decimals and Fractions are supported:
				805
				806	>>> from decimal import Decimal as D
				807	>>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				808	Decimal('24.815')
				809
				810	>>> from fractions import Fraction as F
				811	>>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
				812	Fraction(13, 72)
				813
				814	"""
				815	if iter(data) is data:
				816	data = list(data)
				817	n = len(data)
				818	if n < 1:
				819	raise StatisticsError('pvariance requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	820	T, ss = _ss(data, mu)
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	821	return _convert(ss / n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	822
				823
				824	def stdev(data, xbar=None):
				825	"""Return the square root of the sample variance.
				826
				827	See ``variance`` for arguments and other details.
				828
				829	>>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				830	1.0810874155219827
				831
				832	"""
				833	var = variance(data, xbar)
				834	try:
				835	return var.sqrt()
				836	except AttributeError:
				837	return math.sqrt(var)
				838
				839
				840	def pstdev(data, mu=None):
				841	"""Return the square root of the population variance.
				842
				843	See ``pvariance`` for arguments and other details.
				844
				845	>>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				846	0.986893273527251
				847
				848	"""
				849	var = pvariance(data, mu)
				850	try:
				851	return var.sqrt()
				852	except AttributeError:
				853	return math.sqrt(var)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	854
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	855
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	856	# === Statistics for relations between two inputs ===
				857
				858	# See https://en.wikipedia.org/wiki/Covariance
				859	# https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
				860	# https://en.wikipedia.org/wiki/Simple_linear_regression
				861
				862
				863	def covariance(x, y, /):
				864	"""Covariance
				865
				866	Return the sample covariance of two inputs x and y. Covariance
				867	is a measure of the joint variability of two inputs.
				868
				869	>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
				870	>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
				871	>>> covariance(x, y)
				872	0.75
				873	>>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
				874	>>> covariance(x, z)
				875	-7.5
				876	>>> covariance(z, x)
				877	-7.5
				878
				879	"""
				880	n = len(x)
				881	if len(y) != n:
				882	raise StatisticsError('covariance requires that both inputs have same number of data points')
				883	if n < 2:
				884	raise StatisticsError('covariance requires at least two data points')
				885	xbar = mean(x)
				886	ybar = mean(y)
				887	total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
				888	return total / (n - 1)
				889
				890
				891	def correlation(x, y, /):
				892	"""Pearson's correlation coefficient
				893
				894	Return the Pearson's correlation coefficient for two inputs. Pearson's
				895	correlation coefficient r takes values between -1 and +1. It measures the
				896	strength and direction of the linear relationship, where +1 means very
				897	strong, positive linear relationship, -1 very strong, negative linear
				898	relationship, and 0 no linear relationship.
				899
				900	>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
				901	>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
				902	>>> correlation(x, x)
				903	1.0
				904	>>> correlation(x, y)
				905	-1.0
				906
				907	"""
				908	n = len(x)
				909	if len(y) != n:
				910	raise StatisticsError('correlation requires that both inputs have same number of data points')
				911	if n < 2:
				912	raise StatisticsError('correlation requires at least two data points')
				913	cov = covariance(x, y)
				914	stdx = stdev(x)
				915	stdy = stdev(y)
				916	try:
				917	return cov / (stdx * stdy)
				918	except ZeroDivisionError:
				919	raise StatisticsError('at least one of the inputs is constant')
				920
				921
				922	LinearRegression = namedtuple('LinearRegression', ['intercept', 'slope'])
				923
				924
				925	def linear_regression(regressor, dependent_variable, /):
				926	"""Intercept and slope for simple linear regression
				927
				928	Return the intercept and slope of simple linear regression
				929	parameters estimated using ordinary least squares. Simple linear
				930	regression describes relationship between regressor and
				931	dependent variable in terms of linear function::
				932
				933	dependent_variable = intercept + slope * regressor + noise
				934
				935	where ``intercept`` and ``slope`` are the regression parameters that are
				936	estimated, and noise term is an unobserved random variable, for the
				937	variability of the data that was not explained by the linear regression
				938	(it is equal to the difference between prediction and the actual values
				939	of dependent variable).
				940
				941	The parameters are returned as a named tuple.
				942
				943	>>> regressor = [1, 2, 3, 4, 5]
				944	>>> noise = NormalDist().samples(5, seed=42)
				945	>>> dependent_variable = [2 + 3 * regressor[i] + noise[i] for i in range(5)]
				946	>>> linear_regression(regressor, dependent_variable) #doctest: +ELLIPSIS
				947	LinearRegression(intercept=1.75684970486..., slope=3.09078914170...)
				948
				949	"""
				950	n = len(regressor)
				951	if len(dependent_variable) != n:
				952	raise StatisticsError('linear regression requires that both inputs have same number of data points')
				953	if n < 2:
				954	raise StatisticsError('linear regression requires at least two data points')
				955	try:
				956	slope = covariance(regressor, dependent_variable) / variance(regressor)
				957	except ZeroDivisionError:
				958	raise StatisticsError('regressor is constant')
				959	intercept = mean(dependent_variable) - slope * mean(regressor)
				960	return LinearRegression(intercept=intercept, slope=slope)
				961
				962
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	963	## Normal Distribution #####################################################
				964
Dong-hee Na	0a18ee4	2019-08-24 07:20:30 +0900	[diff] [blame]	965
				966	def _normal_dist_inv_cdf(p, mu, sigma):
				967	# There is no closed-form solution to the inverse CDF for the normal
				968	# distribution, so we use a rational approximation instead:
				969	# Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
				970	# Normal Distribution". Applied Statistics. Blackwell Publishing. 37
				971	# (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
				972	q = p - 0.5
				973	if fabs(q) <= 0.425:
				974	r = 0.180625 - q * q
				975	# Hash sum: 55.88319_28806_14901_4439
				976	num = (((((((2.50908_09287_30122_6727e+3 * r +
				977	3.34305_75583_58812_8105e+4) * r +
				978	6.72657_70927_00870_0853e+4) * r +
				979	4.59219_53931_54987_1457e+4) * r +
				980	1.37316_93765_50946_1125e+4) * r +
				981	1.97159_09503_06551_4427e+3) * r +
				982	1.33141_66789_17843_7745e+2) * r +
				983	3.38713_28727_96366_6080e+0) * q
				984	den = (((((((5.22649_52788_52854_5610e+3 * r +
				985	2.87290_85735_72194_2674e+4) * r +
				986	3.93078_95800_09271_0610e+4) * r +
				987	2.12137_94301_58659_5867e+4) * r +
				988	5.39419_60214_24751_1077e+3) * r +
				989	6.87187_00749_20579_0830e+2) * r +
				990	4.23133_30701_60091_1252e+1) * r +
				991	1.0)
				992	x = num / den
				993	return mu + (x * sigma)
				994	r = p if q <= 0.0 else 1.0 - p
				995	r = sqrt(-log(r))
				996	if r <= 5.0:
				997	r = r - 1.6
				998	# Hash sum: 49.33206_50330_16102_89036
				999	num = (((((((7.74545_01427_83414_07640e-4 * r +
				1000	2.27238_44989_26918_45833e-2) * r +
				1001	2.41780_72517_74506_11770e-1) * r +
				1002	1.27045_82524_52368_38258e+0) * r +
				1003	3.64784_83247_63204_60504e+0) * r +
				1004	5.76949_72214_60691_40550e+0) * r +
				1005	4.63033_78461_56545_29590e+0) * r +
				1006	1.42343_71107_49683_57734e+0)
				1007	den = (((((((1.05075_00716_44416_84324e-9 * r +
				1008	5.47593_80849_95344_94600e-4) * r +
				1009	1.51986_66563_61645_71966e-2) * r +
				1010	1.48103_97642_74800_74590e-1) * r +
				1011	6.89767_33498_51000_04550e-1) * r +
				1012	1.67638_48301_83803_84940e+0) * r +
				1013	2.05319_16266_37758_82187e+0) * r +
				1014	1.0)
				1015	else:
				1016	r = r - 5.0
				1017	# Hash sum: 47.52583_31754_92896_71629
				1018	num = (((((((2.01033_43992_92288_13265e-7 * r +
				1019	2.71155_55687_43487_57815e-5) * r +
				1020	1.24266_09473_88078_43860e-3) * r +
				1021	2.65321_89526_57612_30930e-2) * r +
				1022	2.96560_57182_85048_91230e-1) * r +
				1023	1.78482_65399_17291_33580e+0) * r +
				1024	5.46378_49111_64114_36990e+0) * r +
				1025	6.65790_46435_01103_77720e+0)
				1026	den = (((((((2.04426_31033_89939_78564e-15 * r +
				1027	1.42151_17583_16445_88870e-7) * r +
				1028	1.84631_83175_10054_68180e-5) * r +
				1029	7.86869_13114_56132_59100e-4) * r +
				1030	1.48753_61290_85061_48525e-2) * r +
				1031	1.36929_88092_27358_05310e-1) * r +
				1032	5.99832_20655_58879_37690e-1) * r +
				1033	1.0)
				1034	x = num / den
				1035	if q < 0.0:
				1036	x = -x
				1037	return mu + (x * sigma)
				1038
				1039
Raymond Hettinger	0400a7f	2020-05-02 19:30:24 -0700	[diff] [blame]	1040	# If available, use C implementation
				1041	try:
				1042	from _statistics import _normal_dist_inv_cdf
				1043	except ImportError:
				1044	pass
				1045
				1046
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1047	class NormalDist:
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1048	"Normal distribution of a random variable"
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1049	# https://en.wikipedia.org/wiki/Normal_distribution
				1050	# https://en.wikipedia.org/wiki/Variance#Properties
				1051
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1052	__slots__ = {
				1053	'_mu': 'Arithmetic mean of a normal distribution',
				1054	'_sigma': 'Standard deviation of a normal distribution',
				1055	}
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1056
				1057	def __init__(self, mu=0.0, sigma=1.0):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1058	"NormalDist where mu is the mean and sigma is the standard deviation."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1059	if sigma < 0.0:
				1060	raise StatisticsError('sigma must be non-negative')
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	1061	self._mu = float(mu)
				1062	self._sigma = float(sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1063
				1064	@classmethod
				1065	def from_samples(cls, data):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1066	"Make a normal distribution instance from sample data."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1067	if not isinstance(data, (list, tuple)):
				1068	data = list(data)
				1069	xbar = fmean(data)
				1070	return cls(xbar, stdev(data, xbar))
				1071
Raymond Hettinger	fb8c7d5	2019-04-23 01:46:18 -0700	[diff] [blame]	1072	def samples(self, n, *, seed=None):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1073	"Generate n samples for a given mean and standard deviation."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1074	gauss = random.gauss if seed is None else random.Random(seed).gauss
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1075	mu, sigma = self._mu, self._sigma
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1076	return [gauss(mu, sigma) for i in range(n)]
				1077
				1078	def pdf(self, x):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1079	"Probability density function. P(x <= X < x+dx) / dx"
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1080	variance = self._sigma ** 2.0
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1081	if not variance:
				1082	raise StatisticsError('pdf() not defined when sigma is zero')
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1083	return exp((x - self._mu)*2.0 / (-2.0variance)) / sqrt(tau*variance)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1084
				1085	def cdf(self, x):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1086	"Cumulative distribution function. P(X <= x)"
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1087	if not self._sigma:
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1088	raise StatisticsError('cdf() not defined when sigma is zero')
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1089	return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1090
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1091	def inv_cdf(self, p):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1092	"""Inverse cumulative distribution function. x : P(X <= x) = p
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1093
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1094	Finds the value of the random variable such that the probability of
				1095	the variable being less than or equal to that value equals the given
				1096	probability.
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1097
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1098	This function is also called the percent point function or quantile
				1099	function.
				1100	"""
				1101	if p <= 0.0 or p >= 1.0:
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1102	raise StatisticsError('p must be in the range 0.0 < p < 1.0')
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1103	if self._sigma <= 0.0:
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1104	raise StatisticsError('cdf() not defined when sigma at or below zero')
Dong-hee Na	0a18ee4	2019-08-24 07:20:30 +0900	[diff] [blame]	1105	return _normal_dist_inv_cdf(p, self._mu, self._sigma)
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1106
Raymond Hettinger	4db25d5	2019-09-08 16:57:58 -0700	[diff] [blame]	1107	def quantiles(self, n=4):
				1108	"""Divide into n continuous intervals with equal probability.
				1109
				1110	Returns a list of (n - 1) cut points separating the intervals.
				1111
				1112	Set n to 4 for quartiles (the default). Set n to 10 for deciles.
				1113	Set n to 100 for percentiles which gives the 99 cuts points that
				1114	separate the normal distribution in to 100 equal sized groups.
				1115	"""
				1116	return [self.inv_cdf(i / n) for i in range(1, n)]
				1117
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1118	def overlap(self, other):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1119	"""Compute the overlapping coefficient (OVL) between two normal distributions.
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1120
				1121	Measures the agreement between two normal probability distributions.
				1122	Returns a value between 0.0 and 1.0 giving the overlapping area in
				1123	the two underlying probability density functions.
				1124
				1125	>>> N1 = NormalDist(2.4, 1.6)
				1126	>>> N2 = NormalDist(3.2, 2.0)
				1127	>>> N1.overlap(N2)
				1128	0.8035050657330205
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1129	"""
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1130	# See: "The overlapping coefficient as a measure of agreement between
				1131	# probability distributions and point estimation of the overlap of two
				1132	# normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
				1133	# http://dx.doi.org/10.1080/03610928908830127
				1134	if not isinstance(other, NormalDist):
				1135	raise TypeError('Expected another NormalDist instance')
				1136	X, Y = self, other
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	1137	if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1138	X, Y = Y, X
				1139	X_var, Y_var = X.variance, Y.variance
				1140	if not X_var or not Y_var:
				1141	raise StatisticsError('overlap() not defined when sigma is zero')
				1142	dv = Y_var - X_var
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1143	dm = fabs(Y._mu - X._mu)
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1144	if not dv:
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1145	return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
				1146	a = X._mu * Y_var - Y._mu * X_var
				1147	b = X._sigma * Y._sigma * sqrt(dm*2.0 + dv log(Y_var / X_var))
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1148	x1 = (a + b) / dv
				1149	x2 = (a - b) / dv
				1150	return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
				1151
Raymond Hettinger	70f027d	2020-04-16 10:25:14 -0700	[diff] [blame]	1152	def zscore(self, x):
				1153	"""Compute the Standard Score. (x - mean) / stdev
				1154
				1155	Describes x in terms of the number of standard deviations
				1156	above or below the mean of the normal distribution.
				1157	"""
				1158	# https://www.statisticshowto.com/probability-and-statistics/z-score/
				1159	if not self._sigma:
				1160	raise StatisticsError('zscore() not defined when sigma is zero')
				1161	return (x - self._mu) / self._sigma
				1162
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1163	@property
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1164	def mean(self):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1165	"Arithmetic mean of the normal distribution."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1166	return self._mu
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1167
				1168	@property
Raymond Hettinger	4db25d5	2019-09-08 16:57:58 -0700	[diff] [blame]	1169	def median(self):
				1170	"Return the median of the normal distribution"
				1171	return self._mu
				1172
				1173	@property
				1174	def mode(self):
				1175	"""Return the mode of the normal distribution
				1176
				1177	The mode is the value x where which the probability density
				1178	function (pdf) takes its maximum value.
				1179	"""
				1180	return self._mu
				1181
				1182	@property
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1183	def stdev(self):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1184	"Standard deviation of the normal distribution."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1185	return self._sigma
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1186
				1187	@property
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1188	def variance(self):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1189	"Square of the standard deviation."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1190	return self._sigma ** 2.0
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1191
				1192	def __add__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1193	"""Add a constant or another NormalDist instance.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1194
				1195	If other is a constant, translate mu by the constant,
				1196	leaving sigma unchanged.
				1197
				1198	If other is a NormalDist, add both the means and the variances.
				1199	Mathematically, this works only if the two distributions are
				1200	independent or if they are jointly normally distributed.
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1201	"""
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1202	if isinstance(x2, NormalDist):
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1203	return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
				1204	return NormalDist(x1._mu + x2, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1205
				1206	def __sub__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1207	"""Subtract a constant or another NormalDist instance.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1208
				1209	If other is a constant, translate by the constant mu,
				1210	leaving sigma unchanged.
				1211
				1212	If other is a NormalDist, subtract the means and add the variances.
				1213	Mathematically, this works only if the two distributions are
				1214	independent or if they are jointly normally distributed.
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1215	"""
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1216	if isinstance(x2, NormalDist):
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1217	return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
				1218	return NormalDist(x1._mu - x2, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1219
				1220	def __mul__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1221	"""Multiply both mu and sigma by a constant.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1222
				1223	Used for rescaling, perhaps to change measurement units.
				1224	Sigma is scaled with the absolute value of the constant.
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1225	"""
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1226	return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1227
				1228	def __truediv__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1229	"""Divide both mu and sigma by a constant.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1230
				1231	Used for rescaling, perhaps to change measurement units.
				1232	Sigma is scaled with the absolute value of the constant.
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1233	"""
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1234	return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1235
				1236	def __pos__(x1):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1237	"Return a copy of the instance."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1238	return NormalDist(x1._mu, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1239
				1240	def __neg__(x1):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1241	"Negates mu while keeping sigma the same."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1242	return NormalDist(-x1._mu, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1243
				1244	__radd__ = __add__
				1245
				1246	def __rsub__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1247	"Subtract a NormalDist from a constant or another NormalDist."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1248	return -(x1 - x2)
				1249
				1250	__rmul__ = __mul__
				1251
				1252	def __eq__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1253	"Two NormalDist objects are equal if their mu and sigma are both equal."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1254	if not isinstance(x2, NormalDist):
				1255	return NotImplemented
Raymond Hettinger	5eabec0	2019-10-18 14:20:35 -0700	[diff] [blame]	1256	return x1._mu == x2._mu and x1._sigma == x2._sigma
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1257
				1258	def __hash__(self):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1259	"NormalDist objects hash equal if their mu and sigma are both equal."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1260	return hash((self._mu, self._sigma))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1261
				1262	def __repr__(self):
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1263	return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'