Blame - Lib/statistics.py - platform/external/python/cpython3

blob: cfcc456fd786efafb0f732020956ff052d4b4aa1 [file] [log] [blame]

Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	1	"""
				2	Basic statistics module.
				3
				4	This module provides functions for calculating statistics of data, including
				5	averages, variance, and standard deviation.
				6
				7	Calculating averages
				8	--------------------
				9
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	10	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	11	Function Description
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	12	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	13	mean Arithmetic mean (average) of data.
Raymond Hettinger	7280048	2019-04-23 01:35:16 -0700	[diff] [blame]	14	fmean Fast, floating point arithmetic mean.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	15	geometric_mean Geometric mean of data.
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	16	harmonic_mean Harmonic mean of data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	17	median Median (middle value) of data.
				18	median_low Low median of data.
				19	median_high High median of data.
				20	median_grouped Median, or 50th percentile, of grouped data.
				21	mode Mode (most common value) of data.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	22	multimode List of modes (most common values of data).
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	23	quantiles Divide data into intervals with equal probability.
				24	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	25
				26	Calculate the arithmetic mean ("the average") of data:
				27
				28	>>> mean([-1.0, 2.5, 3.25, 5.75])
				29	2.625
				30
				31
				32	Calculate the standard median of discrete data:
				33
				34	>>> median([2, 3, 4, 5])
				35	3.5
				36
				37
				38	Calculate the median, or 50th percentile, of data grouped into class intervals
				39	centred on the data values provided. E.g. if your data points are rounded to
				40	the nearest whole number:
				41
				42	>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
				43	2.8333333333...
				44
				45	This should be interpreted in this way: you have two data points in the class
				46	interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
				47	the class interval 3.5-4.5. The median of these data points is 2.8333...
				48
				49
				50	Calculating variability or spread
				51	---------------------------------
				52
				53	================== =============================================
				54	Function Description
				55	================== =============================================
				56	pvariance Population variance of data.
				57	variance Sample variance of data.
				58	pstdev Population standard deviation of data.
				59	stdev Sample standard deviation of data.
				60	================== =============================================
				61
				62	Calculate the standard deviation of sample data:
				63
				64	>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
				65	4.38961843444...
				66
				67	If you have previously calculated the mean, you can pass it as the optional
				68	second argument to the four "spread" functions to avoid recalculating it:
				69
				70	>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
				71	>>> mu = mean(data)
				72	>>> pvariance(data, mu)
				73	2.5
				74
				75
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	76	Statistics for relations between two inputs
				77	-------------------------------------------
				78
				79	================== ====================================================
				80	Function Description
				81	================== ====================================================
				82	covariance Sample covariance for two variables.
				83	correlation Pearson's correlation coefficient for two variables.
				84	linear_regression Intercept and slope for simple linear regression.
				85	================== ====================================================
				86
				87	Calculate covariance, Pearson's correlation, and simple linear regression
				88	for two inputs:
				89
				90	>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
				91	>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
				92	>>> covariance(x, y)
				93	0.75
				94	>>> correlation(x, y) #doctest: +ELLIPSIS
				95	0.31622776601...
				96	>>> linear_regression(x, y)
Miss Islington (bot)	8677987	2021-05-24 18:11:12 -0700	[diff] [blame]	97	LinearRegression(slope=0.1, intercept=1.5)
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	98
				99
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	100	Exceptions
				101	----------
				102
				103	A single exception is defined: StatisticsError is a subclass of ValueError.
				104
				105	"""
				106
# Public names exported by ``from statistics import *``.
# The entries are kept in ASCII-sorted order (capitalised names first).
__all__ = [
    'NormalDist',
    'StatisticsError',
    'correlation',
    'covariance',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'linear_regression',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	129
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	130	import math
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	131	import numbers
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	132	import random
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	133
				134	from fractions import Fraction
				135	from decimal import Decimal
Raymond Hettinger	cc3467a	2020-12-23 19:52:09 -0800	[diff] [blame]	136	from itertools import groupby, repeat
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	137	from bisect import bisect_left, bisect_right
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	138	from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	139	from operator import itemgetter
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	140	from collections import Counter, namedtuple
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	141
				142	# === Exceptions ===
				143
class StatisticsError(ValueError):
    """Error raised by the functions in this module (a ValueError subclass)."""
    pass
				146
				147
				148	# === Private utilities ===
				149
def _sum(data):
    """_sum(data) -> (type, sum, count)

    Add up the numeric items of ``data`` exactly and return a 3-tuple:
    the type the result should eventually be converted to, the total as
    a Fraction, and the number of items seen.

    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 0.25])
    (<class 'float'>, Fraction(19, 2), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixing types raises TypeError, except that ints may be combined
    with any other supported type.
    """
    result_type = int
    n_items = 0
    # Maps each denominator to the running sum of the numerators that
    # share it.  The key None collects NAN/INF values (see _exact_ratio).
    by_denominator = {}
    for item_type, run in groupby(data, type):
        result_type = _coerce(result_type, item_type)  # or raise TypeError
        for numer, denom in map(_exact_ratio, run):
            n_items += 1
            by_denominator[denom] = by_denominator.get(denom, 0) + numer
    if None in by_denominator:
        # A NAN or INF was present, so the finite partial sums cannot
        # affect the outcome; only the special entry matters.
        total = by_denominator[None]
        assert not _isfinite(total)
    else:
        # Combine the per-denominator partial sums exactly.
        total = sum(Fraction(n, d) for d, n in by_denominator.items())
    return (result_type, total, n_items)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	200
				201
def _isfinite(x):
    """Return True if x is finite, i.e. neither an infinity nor a NAN."""
    try:
        # Objects with their own is_finite() method (e.g. Decimal).
        finite = x.is_finite()
    except AttributeError:
        # Anything else is handed to math.isfinite (coerces to float).
        finite = math.isfinite(x)
    return finite
				207
				208
				209	def _coerce(T, S):
				210	"""Coerce types T and S to a common type, or raise TypeError.
				211
				212	Coercion rules are currently an implementation detail. See the CoerceTest
				213	test class in test_statistics for details.
				214	"""
				215	# See http://bugs.python.org/issue24068.
				216	assert T is not bool, "initial type T is bool"
				217	# If the types are the same, no need to coerce anything. Put this
				218	# first, so that the usual case (no coercion needed) happens as soon
				219	# as possible.
				220	if T is S: return T
				221	# Mixed int & other coerce to the other type.
				222	if S is int or S is bool: return T
				223	if T is int: return S
				224	# If one is a (strict) subclass of the other, coerce to the subclass.
				225	if issubclass(S, T): return S
				226	if issubclass(T, S): return T
				227	# Ints coerce to the other type.
				228	if issubclass(T, int): return S
				229	if issubclass(S, int): return T
				230	# Mixed fraction & float coerces to float (or float subclass).
				231	if issubclass(T, Fraction) and issubclass(S, float):
				232	return S
				233	if issubclass(T, float) and issubclass(S, Fraction):
				234	return T
				235	# Any other combination is disallowed.
				236	msg = "don't know how to coerce %s and %s"
				237	raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan	73afe2a	2014-02-08 19:58:04 +1000	[diff] [blame]	238
				239
def _exact_ratio(x):
    """Express Real number x as an exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.  A float
    NAN or INF is returned as the pair ``(x, None)``.
    """
    try:
        return x.as_integer_ratio()
    except AttributeError:
        # No as_integer_ratio(); try the numerator/denominator
        # attributes below instead.
        pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return x, None
    try:
        # x may be an Integral ABC.
        return x.numerator, x.denominator
    except AttributeError:
        raise TypeError(
            f"can't convert type '{type(x).__name__}' to numerator/denominator"
        )
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	262
				263
def _convert(value, T):
    """Return ``value`` converted to the numeric type T.

    A non-integral value is converted to float when T is int (or an int
    subclass), since T could not represent it.
    """
    if type(value) is T:
        # Nothing to do: T is Fraction, or value is a NAN or INF
        # (Decimal or float) that was passed through untouched.
        return value
    needs_float = issubclass(T, int) and value.denominator != 1
    target = float if needs_float else T
    try:
        # FIXME: what do we do if this overflows?
        return target(value)
    except TypeError:
        if not issubclass(target, Decimal):
            raise
        # Decimal cannot be built from the value directly, so divide
        # numerator by denominator in Decimal arithmetic instead.
        return target(value.numerator) / target(value.denominator)
				280
				281
def _find_lteq(a, x):
    """Return the index of the leftmost item in sorted list a equal to x.

    Raises ValueError if x does not occur in a.
    """
    pos = bisect_left(a, x)
    found = pos != len(a) and a[pos] == x
    if not found:
        raise ValueError
    return pos
				288
				289
				290	def _find_rteq(a, l, x):
				291	'Locate the rightmost value exactly equal to x'
				292	i = bisect_right(a, x, lo=l)
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	293	if i != (len(a) + 1) and a[i - 1] == x:
				294	return i - 1
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	295	raise ValueError
				296
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	297
				298	def _fail_neg(values, errmsg='negative value'):
				299	"""Iterate over values, failing if any are less than zero."""
				300	for x in values:
				301	if x < 0:
				302	raise StatisticsError(errmsg)
				303	yield x
				304
				305
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	306	# === Measures of central tendency (averages) ===
				307
				308	def mean(data):
				309	"""Return the sample arithmetic mean of data.
				310
				311	>>> mean([1, 2, 3, 4, 4])
				312	2.8
				313
				314	>>> from fractions import Fraction as F
				315	>>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
				316	Fraction(13, 21)
				317
				318	>>> from decimal import Decimal as D
				319	>>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
				320	Decimal('0.5625')
				321
				322	If ``data`` is empty, StatisticsError will be raised.
				323	"""
				324	if iter(data) is data:
				325	data = list(data)
				326	n = len(data)
				327	if n < 1:
				328	raise StatisticsError('mean requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	329	T, total, count = _sum(data)
				330	assert count == n
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	331	return _convert(total / n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	332
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	333
def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    Faster than mean(), and the result is always a float.
    Raises StatisticsError when the input dataset is empty.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        n = len(data)
    except TypeError:
        # The input has no __len__() (e.g. an iterator), so count the
        # items while fsum() consumes them.
        n = 0

        def tally(iterable):
            nonlocal n
            for item in iterable:
                n += 1
                yield item

        total = fsum(tally(data))
    else:
        total = fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	359
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	360
def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    The result is computed as exp() of the arithmetic mean of the
    logarithms of the data.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        return exp(fmean(map(log, data)))
    except ValueError:
        # ValueError covers both log() of a non-positive number and the
        # StatisticsError (a ValueError subclass) fmean raises for empty
        # input; report either one with a single message.
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              'containing positive numbers') from None
				378
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	379
def harmonic_mean(data, weights=None):
    """Return the harmonic mean of data.

    The harmonic mean is the reciprocal of the arithmetic mean of the
    reciprocals of the data.  It is suited to averaging ratios or
    rates, for example speeds.

    Suppose a car travels 40 km/hr for 5 km and then speeds-up to
    60 km/hr for another 5 km. What is the average speed?

        >>> harmonic_mean([40, 60])
        48.0

    Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
    speeds-up to 60 km/hr for the remaining 30 km of the journey. What
    is the average speed?

        >>> harmonic_mean([40, 60], weights=[5, 30])
        56.0

    Raises ``StatisticsError`` if ``data`` is empty, if any element is
    less than zero, or if the number of weights differs from the number
    of data points.
    """
    # Iterators are single-use; snapshot them so len() works.
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    unweighted = weights is None
    if unweighted and n == 1:
        # The harmonic mean of one value is that value, returned as is.
        only = data[0]
        if not isinstance(only, (numbers.Real, Decimal)):
            raise TypeError('unsupported type')
        if only < 0:
            raise StatisticsError(errmsg)
        return only
    if unweighted:
        weights = repeat(1, n)
        sum_weights = n
    else:
        if iter(weights) is weights:
            weights = list(weights)
        if len(weights) != n:
            raise StatisticsError('Number of weights does not match data size')
        sum_weights = _sum(_fail_neg(weights, errmsg))[1]
    try:
        checked = _fail_neg(data, errmsg)
        T, total, _ = _sum(w / x if w else 0 for w, x in zip(weights, checked))
    except ZeroDivisionError:
        # A zero data value with a non-zero weight gives a mean of zero.
        return 0
    if total <= 0:
        raise StatisticsError('Weighted sum must be positive')
    return _convert(sum_weights / total, T)
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	434
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	435	# FIXME: investigate ways to calculate medians without sorting? Quickselect?
				436	def median(data):
				437	"""Return the median (middle value) of numeric data.
				438
				439	When the number of data points is odd, return the middle data point.
				440	When the number of data points is even, the median is interpolated by
				441	taking the average of the two middle values:
				442
				443	>>> median([1, 3, 5])
				444	3
				445	>>> median([1, 3, 5, 7])
				446	4.0
				447
				448	"""
				449	data = sorted(data)
				450	n = len(data)
				451	if n == 0:
				452	raise StatisticsError("no median for empty data")
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	453	if n % 2 == 1:
				454	return data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	455	else:
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	456	i = n // 2
				457	return (data[i - 1] + data[i]) / 2
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	458
				459
				460	def median_low(data):
				461	"""Return the low median of numeric data.
				462
				463	When the number of data points is odd, the middle value is returned.
				464	When it is even, the smaller of the two middle values is returned.
				465
				466	>>> median_low([1, 3, 5])
				467	3
				468	>>> median_low([1, 3, 5, 7])
				469	3
				470
				471	"""
				472	data = sorted(data)
				473	n = len(data)
				474	if n == 0:
				475	raise StatisticsError("no median for empty data")
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	476	if n % 2 == 1:
				477	return data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	478	else:
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	479	return data[n // 2 - 1]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	480
				481
				482	def median_high(data):
				483	"""Return the high median of data.
				484
				485	When the number of data points is odd, the middle value is returned.
				486	When it is even, the larger of the two middle values is returned.
				487
				488	>>> median_high([1, 3, 5])
				489	3
				490	>>> median_high([1, 3, 5, 7])
				491	5
				492
				493	"""
				494	data = sorted(data)
				495	n = len(data)
				496	if n == 0:
				497	raise StatisticsError("no median for empty data")
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	498	return data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	499
				500
				501	def median_grouped(data, interval=1):
Zachary Ware	df2660e	2015-10-27 22:00:41 -0500	[diff] [blame]	502	"""Return the 50th percentile (median) of grouped continuous data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	503
				504	>>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
				505	3.7
				506	>>> median_grouped([52, 52, 53, 54])
				507	52.5
				508
				509	This calculates the median as the 50th percentile, and should be
				510	used when your data is continuous and grouped. In the above example,
				511	the values 1, 2, 3, etc. actually represent the midpoint of classes
				512	0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
				513	class 3.5-4.5, and interpolation is used to estimate it.
				514
				515	Optional argument ``interval`` represents the class interval, and
				516	defaults to 1. Changing the class interval naturally will change the
				517	interpolated 50th percentile value:
				518
				519	>>> median_grouped([1, 3, 3, 5, 7], interval=1)
				520	3.25
				521	>>> median_grouped([1, 3, 3, 5, 7], interval=2)
				522	3.5
				523
				524	This function does not check whether the data points are at least
				525	``interval`` apart.
				526	"""
				527	data = sorted(data)
				528	n = len(data)
				529	if n == 0:
				530	raise StatisticsError("no median for empty data")
				531	elif n == 1:
				532	return data[0]
				533	# Find the value at the midpoint. Remember this corresponds to the
				534	# centre of the class interval.
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	535	x = data[n // 2]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	536	for obj in (x, interval):
				537	if isinstance(obj, (str, bytes)):
				538	raise TypeError('expected number but got %r' % obj)
				539	try:
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	540	L = x - interval / 2 # The lower limit of the median interval.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	541	except TypeError:
				542	# Mixed type. For now we just coerce to float.
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	543	L = float(x) - float(interval) / 2
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	544
				545	# Uses bisection search to search for x in data with log(n) time complexity
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	546	# Find the position of leftmost occurrence of x in data
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	547	l1 = _find_lteq(data, x)
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	548	# Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	549	# Assuming always l1 <= l2
				550	l2 = _find_rteq(data, l1, x)
				551	cf = l1
				552	f = l2 - l1 + 1
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	553	return L + interval * (n / 2 - cf) / f
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	554
				555
				556	def mode(data):
				557	"""Return the most common data point from discrete or nominal data.
				558
				559	``mode`` assumes discrete data, and returns a single value. This is the
				560	standard treatment of the mode as commonly taught in schools:
				561
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	562	>>> mode([1, 1, 2, 3, 3, 3, 3, 4])
				563	3
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	564
				565	This also works with nominal (non-numeric) data:
				566
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	567	>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
				568	'red'
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	569
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	570	If there are multiple modes with same frequency, return the first one
				571	encountered:
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	572
				573	>>> mode(['red', 'red', 'green', 'blue', 'blue'])
				574	'red'
				575
				576	If data is empty, ``mode``, raises StatisticsError.
				577
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	578	"""
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	579	pairs = Counter(iter(data)).most_common(1)
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	580	try:
Raymond Hettinger	7ce4bfa	2019-09-20 21:46:52 -0700	[diff] [blame]	581	return pairs[0][0]
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	582	except IndexError:
				583	raise StatisticsError('no mode for empty data') from None
				584
				585
				586	def multimode(data):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	587	"""Return a list of the most frequently occurring values.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	588
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	589	Will return more than one result if there are multiple modes
				590	or an empty list if data is empty.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	591
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	592	>>> multimode('aabbbbbbbbcc')
				593	['b']
				594	>>> multimode('aabbbbccddddeeffffgg')
				595	['b', 'd', 'f']
				596	>>> multimode('')
				597	[]
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	598	"""
				599	counts = Counter(iter(data)).most_common()
				600	maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
				601	return list(map(itemgetter(0), mode_items))
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	602
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	603
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	604	# Notes on methods for computing quantiles
				605	# ----------------------------------------
				606	#
				607	# There is no one perfect way to compute quantiles. Here we offer
				608	# two methods that serve common needs. Most other packages
				609	# surveyed offered at least one or both of these two, making them
				610	# "standard" in the sense of "widely-adopted and reproducible".
				611	# They are also easy to explain, easy to compute manually, and have
				612	# straight-forward interpretations that aren't surprising.
				613
				614	# The default method is known as "R6", "PERCENTILE.EXC", or "expected
				615	# value of rank order statistics". The alternative method is known as
				616	# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
				617
				618	# For sample data where there is a positive probability for values
				619	# beyond the range of the data, the R6 exclusive method is a
				620	# reasonable choice. Consider a random sample of nine values from a
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	621	# population with a uniform distribution from 0.0 to 1.0. The
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	622	# distribution of the third ranked sample point is described by
				623	# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
				624	# mean=0.300. Only the latter (which corresponds with R6) gives the
				625	# desired cut point with 30% of the population falling below that
				626	# value, making it comparable to a result from an inv_cdf() function.
Raymond Hettinger	7ce4bfa	2019-09-20 21:46:52 -0700	[diff] [blame]	627	# The R6 exclusive method is also idempotent.
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	628
				629	# For describing population data where the end points are known to
				630	# be included in the data, the R7 inclusive method is a reasonable
				631	# choice. Instead of the mean, it uses the mode of the beta
				632	# distribution for the interior points. Per Hyndman & Fan, "One nice
				633	# property is that the vertices of Q7(p) divide the range into n - 1
				634	# intervals, and exactly 100p% of the intervals lie to the left of
				635	# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
				636
Raymond Hettinger	eed5e9a	2019-07-19 01:57:22 -0700	[diff] [blame]	637	# If needed, other methods could be added. However, for now, the
				638	# position is that fewer options make for easier choices and that
				639	# external packages can be used for anything more advanced.
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	640
def quantiles(data, *, n=4, method='exclusive'):
    """Divide data into n continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set n to 4 for quartiles (the default).  Set n to 10 for deciles.
    Set n to 100 for percentiles, which gives the 99 cut points that
    separate data into 100 equal-sized groups.

    The data can be any iterable containing sample data.
    The cut points are linearly interpolated between data points.

    If method is set to 'inclusive', data is treated as population
    data.  The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.

    Raises StatisticsError if n < 1 or if fewer than two data points
    are supplied, and ValueError if method is not recognized.
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        m = ld - 1
        result = []
        for i in range(1, n):
            # j is the index of the lower neighbor; delta/n is the
            # fractional distance toward the upper neighbor.
            j, delta = divmod(i * m, n)
            interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
            result.append(interpolated)
        return result
    if method == 'exclusive':
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n                               # rescale i to m/n
            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
            delta = i*m - j*n                            # exact integer math
            interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
            result.append(interpolated)
        return result
    raise ValueError(f'Unknown method: {method!r}')
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	682
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	683
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	684	# === Measures of spread ===
				685
				686	# See http://mathworld.wolfram.com/Variance.html
				687	# http://mathworld.wolfram.com/SampleVariance.html
				688	# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
				689	#
				690	# Under no circumstances use the so-called "computational formula for
				691	# variance", as that is only suitable for hand calculations with a small
				692	# amount of low-precision data. It has terrible numeric properties.
				693	#
				694	# See a comparison of three computational methods here:
				695	# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
				696
				697	def _ss(data, c=None):
				698	"""Return sum of square deviations of sequence data.
				699
				700	If ``c`` is None, the mean is calculated in one pass, and the deviations
				701	from the mean are calculated in a second pass. Otherwise, deviations are
				702	calculated from ``c`` as given. Use the second case with care, as it can
				703	lead to garbage results.
				704	"""
Raymond Hettinger	d71ab4f	2020-06-13 15:55:52 -0700	[diff] [blame]	705	if c is not None:
				706	T, total, count = _sum((x-c)**2 for x in data)
				707	return (T, total)
Raymond Hettinger	3c30805	2021-09-08 22:42:29 -0500	[diff] [blame]	708	T, total, count = _sum(data)
				709	mean_n, mean_d = (total / count).as_integer_ratio()
				710	partials = Counter()
				711	for n, d in map(_exact_ratio, data):
				712	diff_n = n * mean_d - d * mean_n
				713	diff_d = d * mean_d
				714	partials[diff_d * diff_d] += diff_n * diff_n
				715	if None in partials:
				716	# The sum will be a NAN or INF. We can ignore all the finite
				717	# partials, and just look at this special one.
				718	total = partials[None]
				719	assert not _isfinite(total)
				720	else:
				721	total = sum(Fraction(n, d) for d, n in partials.items())
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	722	return (T, total)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	723
				724
				725	def variance(data, xbar=None):
				726	"""Return the sample variance of data.
				727
				728	data should be an iterable of Real-valued numbers, with at least two
				729	values. The optional argument xbar, if given, should be the mean of
				730	the data. If it is missing or None, the mean is automatically calculated.
				731
				732	Use this function when your data is a sample from a population. To
				733	calculate the variance from the entire population, see ``pvariance``.
				734
				735	Examples:
				736
				737	>>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
				738	>>> variance(data)
				739	1.3720238095238095
				740
				741	If you have already calculated the mean of your data, you can pass it as
				742	the optional second argument ``xbar`` to avoid recalculating it:
				743
				744	>>> m = mean(data)
				745	>>> variance(data, m)
				746	1.3720238095238095
				747
				748	This function does not check that ``xbar`` is actually the mean of
				749	``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
				750	impossible results.
				751
				752	Decimals and Fractions are supported:
				753
				754	>>> from decimal import Decimal as D
				755	>>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				756	Decimal('31.01875')
				757
				758	>>> from fractions import Fraction as F
				759	>>> variance([F(1, 6), F(1, 2), F(5, 3)])
				760	Fraction(67, 108)
				761
				762	"""
				763	if iter(data) is data:
				764	data = list(data)
				765	n = len(data)
				766	if n < 2:
				767	raise StatisticsError('variance requires at least two data points')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	768	T, ss = _ss(data, xbar)
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	769	return _convert(ss / (n - 1), T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	770
				771
				772	def pvariance(data, mu=None):
				773	"""Return the population variance of ``data``.
				774
Raymond Hettinger	733b9a3	2019-11-11 23:35:06 -0800	[diff] [blame]	775	data should be a sequence or iterable of Real-valued numbers, with at least one
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	776	value. The optional argument mu, if given, should be the mean of
				777	the data. If it is missing or None, the mean is automatically calculated.
				778
				779	Use this function to calculate the variance from the entire population.
				780	To estimate the variance from a sample, the ``variance`` function is
				781	usually a better choice.
				782
				783	Examples:
				784
				785	>>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
				786	>>> pvariance(data)
				787	1.25
				788
				789	If you have already calculated the mean of the data, you can pass it as
				790	the optional second argument to avoid recalculating it:
				791
				792	>>> mu = mean(data)
				793	>>> pvariance(data, mu)
				794	1.25
				795
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	796	Decimals and Fractions are supported:
				797
				798	>>> from decimal import Decimal as D
				799	>>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				800	Decimal('24.815')
				801
				802	>>> from fractions import Fraction as F
				803	>>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
				804	Fraction(13, 72)
				805
				806	"""
				807	if iter(data) is data:
				808	data = list(data)
				809	n = len(data)
				810	if n < 1:
				811	raise StatisticsError('pvariance requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	812	T, ss = _ss(data, mu)
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	813	return _convert(ss / n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	814
				815
				816	def stdev(data, xbar=None):
				817	"""Return the square root of the sample variance.
				818
				819	See ``variance`` for arguments and other details.
				820
				821	>>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				822	1.0810874155219827
				823
				824	"""
Raymond Hettinger	3c30805	2021-09-08 22:42:29 -0500	[diff] [blame]	825	# Fixme: Despite the exact sum of squared deviations, some inaccuracy
				826	# remain because there are two rounding steps. The first occurs in
				827	# the _convert() step for variance(), the second occurs in math.sqrt().
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	828	var = variance(data, xbar)
				829	try:
				830	return var.sqrt()
				831	except AttributeError:
				832	return math.sqrt(var)
				833
				834
				835	def pstdev(data, mu=None):
				836	"""Return the square root of the population variance.
				837
				838	See ``pvariance`` for arguments and other details.
				839
				840	>>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				841	0.986893273527251
				842
				843	"""
Raymond Hettinger	3c30805	2021-09-08 22:42:29 -0500	[diff] [blame]	844	# Fixme: Despite the exact sum of squared deviations, some inaccuracy
				845	# remain because there are two rounding steps. The first occurs in
				846	# the _convert() step for pvariance(), the second occurs in math.sqrt().
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	847	var = pvariance(data, mu)
				848	try:
				849	return var.sqrt()
				850	except AttributeError:
				851	return math.sqrt(var)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	852
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	853
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	854	# === Statistics for relations between two inputs ===
				855
				856	# See https://en.wikipedia.org/wiki/Covariance
				857	# https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
				858	# https://en.wikipedia.org/wiki/Simple_linear_regression
				859
				860
				861	def covariance(x, y, /):
				862	"""Covariance
				863
				864	Return the sample covariance of two inputs x and y. Covariance
				865	is a measure of the joint variability of two inputs.
				866
				867	>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
				868	>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
				869	>>> covariance(x, y)
				870	0.75
				871	>>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
				872	>>> covariance(x, z)
				873	-7.5
				874	>>> covariance(z, x)
				875	-7.5
				876
				877	"""
				878	n = len(x)
				879	if len(y) != n:
				880	raise StatisticsError('covariance requires that both inputs have same number of data points')
				881	if n < 2:
				882	raise StatisticsError('covariance requires at least two data points')
Miss Islington (bot)	5442cfa	2021-06-04 18:49:29 -0700	[diff] [blame]	883	xbar = fsum(x) / n
				884	ybar = fsum(y) / n
				885	sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
				886	return sxy / (n - 1)
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	887
				888
				889	def correlation(x, y, /):
				890	"""Pearson's correlation coefficient
				891
				892	Return the Pearson's correlation coefficient for two inputs. Pearson's
				893	correlation coefficient r takes values between -1 and +1. It measures the
				894	strength and direction of the linear relationship, where +1 means very
				895	strong, positive linear relationship, -1 very strong, negative linear
				896	relationship, and 0 no linear relationship.
				897
				898	>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
				899	>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
				900	>>> correlation(x, x)
				901	1.0
				902	>>> correlation(x, y)
				903	-1.0
				904
				905	"""
				906	n = len(x)
				907	if len(y) != n:
				908	raise StatisticsError('correlation requires that both inputs have same number of data points')
				909	if n < 2:
				910	raise StatisticsError('correlation requires at least two data points')
Miss Islington (bot)	5442cfa	2021-06-04 18:49:29 -0700	[diff] [blame]	911	xbar = fsum(x) / n
				912	ybar = fsum(y) / n
				913	sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
Miss Islington (bot)	4642caf	2021-06-04 19:38:30 -0700	[diff] [blame]	914	sxx = fsum((xi - xbar) ** 2.0 for xi in x)
				915	syy = fsum((yi - ybar) ** 2.0 for yi in y)
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	916	try:
Miss Islington (bot)	4642caf	2021-06-04 19:38:30 -0700	[diff] [blame]	917	return sxy / sqrt(sxx * syy)
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	918	except ZeroDivisionError:
				919	raise StatisticsError('at least one of the inputs is constant')
				920
				921
# Result type of linear_regression(): fields are (slope, intercept).
LinearRegression = namedtuple('LinearRegression', ['slope', 'intercept'])


def linear_regression(x, y, /):
    """Slope and intercept for simple linear regression.

    Return the slope and intercept of the simple linear regression
    parameters estimated by ordinary least squares.  Simple linear
    regression describes the relationship between an independent
    variable x and a dependent variable y as a linear function:

        y = slope * x + intercept + noise

    Here slope and intercept are the estimated regression parameters,
    and noise is the variability of the data left unexplained by the
    regression (the difference between predicted and actual values of
    the dependent variable).

    The parameters are returned as a named tuple.

    Raises StatisticsError if the inputs differ in length, hold fewer
    than two data points, or if x is constant.

    >>> x = [1, 2, 3, 4, 5]
    >>> noise = NormalDist().samples(5, seed=42)
    >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
    >>> linear_regression(x, y)  #doctest: +ELLIPSIS
    LinearRegression(slope=3.09078914170..., intercept=1.75684970486...)

    """
    size = len(x)
    if len(y) != size:
        raise StatisticsError('linear regression requires that both inputs have same number of data points')
    if size < 2:
        raise StatisticsError('linear regression requires at least two data points')
    mean_x = fsum(x) / size
    mean_y = fsum(y) / size
    cross = fsum((a - mean_x) * (b - mean_y) for a, b in zip(x, y))
    spread_x = fsum((a - mean_x) ** 2.0 for a in x)
    try:
        # Equivalent to: covariance(x, y) / variance(x)
        slope = cross / spread_x
    except ZeroDivisionError:
        raise StatisticsError('x is constant')
    return LinearRegression(slope=slope, intercept=mean_y - slope * mean_x)
Tymoteusz Wołodźko	09aa6f9	2021-04-25 13:45:09 +0200	[diff] [blame]	965
				966
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	967	## Normal Distribution #####################################################
				968
Dong-hee Na	0a18ee4	2019-08-24 07:20:30 +0900	[diff] [blame]	969
def _normal_dist_inv_cdf(p, mu, sigma):
    """Approximate the inverse CDF of a normal distribution at p.

    Evaluates a rational approximation num/den (Wichura's AS241, cited
    below) for the standard normal quantile x, then returns
    ``mu + x * sigma``.

    Three coefficient sets are used, selected by how far p lies from
    0.5.  No argument validation is done here; NormalDist.inv_cdf()
    checks 0.0 < p < 1.0 and sigma > 0.0 before calling.
    """
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    q = p - 0.5
    if fabs(q) <= 0.425:
        # Central region: polynomials in r = 0.425**2 - q**2, scaled by q.
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        # NOTE(review): presumably the coefficient checksum published with
        # AS241 -- confirm against the paper.
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    # Tail region: work with the smaller tail probability so that the
    # same polynomials serve both tails; the sign is restored below.
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    if q < 0.0:
        # Lower tail: the quantile is the negative of the magnitude.
        x = -x
    return mu + (x * sigma)
				1042
				1043
Raymond Hettinger	0400a7f	2020-05-02 19:30:24 -0700	[diff] [blame]	1044	# If available, use C implementation
				1045	try:
				1046	from _statistics import _normal_dist_inv_cdf
				1047	except ImportError:
				1048	pass
				1049
				1050
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1051	class NormalDist:
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1052	"Normal distribution of a random variable"
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1053	# https://en.wikipedia.org/wiki/Normal_distribution
				1054	# https://en.wikipedia.org/wiki/Variance#Properties
				1055
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1056	__slots__ = {
				1057	'_mu': 'Arithmetic mean of a normal distribution',
				1058	'_sigma': 'Standard deviation of a normal distribution',
				1059	}
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1060
				1061	def __init__(self, mu=0.0, sigma=1.0):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1062	"NormalDist where mu is the mean and sigma is the standard deviation."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1063	if sigma < 0.0:
				1064	raise StatisticsError('sigma must be non-negative')
Raymond Hettinger	e4810b2	2019-09-05 00:18:47 -0700	[diff] [blame]	1065	self._mu = float(mu)
				1066	self._sigma = float(sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1067
    @classmethod
    def from_samples(cls, data):
        """Build an instance whose mean and stdev are estimated from data.

        data may be any iterable; anything other than a list or tuple
        is copied into a list first because it is traversed twice.
        """
        observations = data if isinstance(data, (list, tuple)) else list(data)
        center = fmean(observations)
        spread = stdev(observations, center)
        return cls(center, spread)
				1075
Raymond Hettinger	fb8c7d5	2019-04-23 01:46:18 -0700	[diff] [blame]	1076	def samples(self, n, *, seed=None):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1077	"Generate n samples for a given mean and standard deviation."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1078	gauss = random.gauss if seed is None else random.Random(seed).gauss
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1079	mu, sigma = self._mu, self._sigma
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1080	return [gauss(mu, sigma) for i in range(n)]
				1081
				1082	def pdf(self, x):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1083	"Probability density function. P(x <= X < x+dx) / dx"
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1084	variance = self._sigma ** 2.0
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1085	if not variance:
				1086	raise StatisticsError('pdf() not defined when sigma is zero')
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1087	return exp((x - self._mu)*2.0 / (-2.0variance)) / sqrt(tau*variance)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1088
				1089	def cdf(self, x):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1090	"Cumulative distribution function. P(X <= x)"
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1091	if not self._sigma:
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1092	raise StatisticsError('cdf() not defined when sigma is zero')
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1093	return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1094
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1095	def inv_cdf(self, p):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1096	"""Inverse cumulative distribution function. x : P(X <= x) = p
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1097
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1098	Finds the value of the random variable such that the probability of
				1099	the variable being less than or equal to that value equals the given
				1100	probability.
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1101
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1102	This function is also called the percent point function or quantile
				1103	function.
				1104	"""
				1105	if p <= 0.0 or p >= 1.0:
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1106	raise StatisticsError('p must be in the range 0.0 < p < 1.0')
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1107	if self._sigma <= 0.0:
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1108	raise StatisticsError('cdf() not defined when sigma at or below zero')
Dong-hee Na	0a18ee4	2019-08-24 07:20:30 +0900	[diff] [blame]	1109	return _normal_dist_inv_cdf(p, self._mu, self._sigma)
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	1110
Raymond Hettinger	4db25d5	2019-09-08 16:57:58 -0700	[diff] [blame]	1111	def quantiles(self, n=4):
				1112	"""Divide into n continuous intervals with equal probability.
				1113
				1114	Returns a list of (n - 1) cut points separating the intervals.
				1115
				1116	Set n to 4 for quartiles (the default). Set n to 10 for deciles.
				1117	Set n to 100 for percentiles which gives the 99 cuts points that
				1118	separate the normal distribution in to 100 equal sized groups.
				1119	"""
				1120	return [self.inv_cdf(i / n) for i in range(1, n)]
				1121
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1122	def overlap(self, other):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1123	"""Compute the overlapping coefficient (OVL) between two normal distributions.
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1124
				1125	Measures the agreement between two normal probability distributions.
				1126	Returns a value between 0.0 and 1.0 giving the overlapping area in
				1127	the two underlying probability density functions.
				1128
				1129	>>> N1 = NormalDist(2.4, 1.6)
				1130	>>> N2 = NormalDist(3.2, 2.0)
				1131	>>> N1.overlap(N2)
				1132	0.8035050657330205
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1133	"""
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1134	# See: "The overlapping coefficient as a measure of agreement between
				1135	# probability distributions and point estimation of the overlap of two
				1136	# normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
				1137	# http://dx.doi.org/10.1080/03610928908830127
				1138	if not isinstance(other, NormalDist):
				1139	raise TypeError('Expected another NormalDist instance')
				1140	X, Y = self, other
Raymond Hettinger	5aad027	2020-06-13 19:17:28 -0700	[diff] [blame]	1141	if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1142	X, Y = Y, X
				1143	X_var, Y_var = X.variance, Y.variance
				1144	if not X_var or not Y_var:
				1145	raise StatisticsError('overlap() not defined when sigma is zero')
				1146	dv = Y_var - X_var
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1147	dm = fabs(Y._mu - X._mu)
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1148	if not dv:
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1149	return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
				1150	a = X._mu * Y_var - Y._mu * X_var
				1151	b = X._sigma * Y._sigma * sqrt(dm*2.0 + dv log(Y_var / X_var))
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1152	x1 = (a + b) / dv
				1153	x2 = (a - b) / dv
				1154	return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
				1155
Raymond Hettinger	70f027d	2020-04-16 10:25:14 -0700	[diff] [blame]	1156	def zscore(self, x):
				1157	"""Compute the Standard Score. (x - mean) / stdev
				1158
				1159	Describes x in terms of the number of standard deviations
				1160	above or below the mean of the normal distribution.
				1161	"""
				1162	# https://www.statisticshowto.com/probability-and-statistics/z-score/
				1163	if not self._sigma:
				1164	raise StatisticsError('zscore() not defined when sigma is zero')
				1165	return (x - self._mu) / self._sigma
				1166
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1167	@property
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1168	def mean(self):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1169	"Arithmetic mean of the normal distribution."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1170	return self._mu
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1171
				1172	@property
Raymond Hettinger	4db25d5	2019-09-08 16:57:58 -0700	[diff] [blame]	1173	def median(self):
				1174	"Return the median of the normal distribution"
				1175	return self._mu
				1176
				1177	@property
				1178	def mode(self):
				1179	"""Return the mode of the normal distribution
				1180
				1181	The mode is the value x where which the probability density
				1182	function (pdf) takes its maximum value.
				1183	"""
				1184	return self._mu
				1185
				1186	@property
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1187	def stdev(self):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1188	"Standard deviation of the normal distribution."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1189	return self._sigma
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1190
				1191	@property
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1192	def variance(self):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1193	"Square of the standard deviation."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1194	return self._sigma ** 2.0
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1195
				1196	def __add__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1197	"""Add a constant or another NormalDist instance.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1198
				1199	If other is a constant, translate mu by the constant,
				1200	leaving sigma unchanged.
				1201
				1202	If other is a NormalDist, add both the means and the variances.
				1203	Mathematically, this works only if the two distributions are
				1204	independent or if they are jointly normally distributed.
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1205	"""
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1206	if isinstance(x2, NormalDist):
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1207	return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
				1208	return NormalDist(x1._mu + x2, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1209
				1210	def __sub__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1211	"""Subtract a constant or another NormalDist instance.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1212
				1213	If other is a constant, translate by the constant mu,
				1214	leaving sigma unchanged.
				1215
				1216	If other is a NormalDist, subtract the means and add the variances.
				1217	Mathematically, this works only if the two distributions are
				1218	independent or if they are jointly normally distributed.
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1219	"""
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1220	if isinstance(x2, NormalDist):
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1221	return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
				1222	return NormalDist(x1._mu - x2, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1223
				1224	def __mul__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1225	"""Multiply both mu and sigma by a constant.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1226
				1227	Used for rescaling, perhaps to change measurement units.
				1228	Sigma is scaled with the absolute value of the constant.
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1229	"""
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1230	return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1231
				1232	def __truediv__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1233	"""Divide both mu and sigma by a constant.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1234
				1235	Used for rescaling, perhaps to change measurement units.
				1236	Sigma is scaled with the absolute value of the constant.
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1237	"""
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1238	return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1239
				1240	def __pos__(x1):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1241	"Return a copy of the instance."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1242	return NormalDist(x1._mu, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1243
				1244	def __neg__(x1):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1245	"Negates mu while keeping sigma the same."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1246	return NormalDist(-x1._mu, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1247
# Reflected addition (e.g. ``5 + dist``) is served by the same function
# as ``dist + 5``.
__radd__ = __add__
				1249
				1250	def __rsub__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1251	"Subtract a NormalDist from a constant or another NormalDist."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1252	return -(x1 - x2)
				1253
# Reflected multiplication (e.g. ``2 * dist``) is served by the same
# function as ``dist * 2``.
__rmul__ = __mul__
				1255
				1256	def __eq__(x1, x2):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1257	"Two NormalDist objects are equal if their mu and sigma are both equal."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1258	if not isinstance(x2, NormalDist):
				1259	return NotImplemented
Raymond Hettinger	5eabec0	2019-10-18 14:20:35 -0700	[diff] [blame]	1260	return x1._mu == x2._mu and x1._sigma == x2._sigma
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1261
				1262	def __hash__(self):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	1263	"NormalDist objects hash equal if their mu and sigma are both equal."
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1264	return hash((self._mu, self._sigma))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1265
				1266	def __repr__(self):
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1267	return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'