Blame - Lib/statistics.py - platform/external/python/cpython3

blob: c7d6568145e0fa8a425026886276d1626ec6a880 [file] [log] [blame]

Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	1	"""
				2	Basic statistics module.
				3
				4	This module provides functions for calculating statistics of data, including
				5	averages, variance, and standard deviation.
				6
				7	Calculating averages
				8	--------------------
				9
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	10	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	11	Function Description
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	12	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	13	mean Arithmetic mean (average) of data.
Raymond Hettinger	7280048	2019-04-23 01:35:16 -0700	[diff] [blame]	14	fmean Fast, floating point arithmetic mean.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	15	geometric_mean Geometric mean of data.
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	16	harmonic_mean Harmonic mean of data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	17	median Median (middle value) of data.
				18	median_low Low median of data.
				19	median_high High median of data.
				20	median_grouped Median, or 50th percentile, of grouped data.
				21	mode Mode (most common value) of data.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	22	multimode List of modes (most common values of data).
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	23	quantiles Divide data into intervals with equal probability.
				24	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	25
				26	Calculate the arithmetic mean ("the average") of data:
				27
				28	>>> mean([-1.0, 2.5, 3.25, 5.75])
				29	2.625
				30
				31
				32	Calculate the standard median of discrete data:
				33
				34	>>> median([2, 3, 4, 5])
				35	3.5
				36
				37
				38	Calculate the median, or 50th percentile, of data grouped into class intervals
				39	centred on the data values provided. E.g. if your data points are rounded to
				40	the nearest whole number:
				41
				42	>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
				43	2.8333333333...
				44
				45	This should be interpreted in this way: you have two data points in the class
				46	interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
				47	the class interval 3.5-4.5. The median of these data points is 2.8333...
				48
				49
				50	Calculating variability or spread
				51	---------------------------------
				52
				53	================== =============================================
				54	Function Description
				55	================== =============================================
				56	pvariance Population variance of data.
				57	variance Sample variance of data.
				58	pstdev Population standard deviation of data.
				59	stdev Sample standard deviation of data.
				60	================== =============================================
				61
				62	Calculate the standard deviation of sample data:
				63
				64	>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
				65	4.38961843444...
				66
				67	If you have previously calculated the mean, you can pass it as the optional
				68	second argument to the four "spread" functions to avoid recalculating it:
				69
				70	>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
				71	>>> mu = mean(data)
				72	>>> pvariance(data, mu)
				73	2.5
				74
				75
				76	Exceptions
				77	----------
				78
				79	A single exception is defined: StatisticsError is a subclass of ValueError.
				80
				81	"""
				82
# Public names of this module: the list consulted by a star-import.
__all__ = [
    'NormalDist',
    'StatisticsError',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	102
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	103	import math
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	104	import numbers
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	105	import random
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	106
				107	from fractions import Fraction
				108	from decimal import Decimal
Victor Stinner	d6debb2	2017-03-27 16:05:26 +0200	[diff] [blame]	109	from itertools import groupby
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	110	from bisect import bisect_left, bisect_right
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	111	from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	112	from operator import itemgetter
				113	from collections import Counter
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	114
				115	# === Exceptions ===
				116
				117	class StatisticsError(ValueError):
				118	pass
				119
				120
				121	# === Private utilities ===
				122
				123	def _sum(data, start=0):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	124	"""_sum(data [, start]) -> (type, sum, count)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	125
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	126	Return a high-precision sum of the given numeric data as a fraction,
				127	together with the type to be converted to and the count of items.
				128
				129	If optional argument ``start`` is given, it is added to the total.
				130	If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	131
				132
				133	Examples
				134	--------
				135
				136	>>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	137	(<class 'float'>, Fraction(11, 1), 5)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	138
				139	Some sources of round-off error will be avoided:
				140
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	141	# Built-in sum returns zero.
				142	>>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	143	(<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	144
				145	Fractions and Decimals are also supported:
				146
				147	>>> from fractions import Fraction as F
				148	>>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	149	(<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	150
				151	>>> from decimal import Decimal as D
				152	>>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
				153	>>> _sum(data)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	154	(<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	155
Nick Coghlan	73afe2a	2014-02-08 19:58:04 +1000	[diff] [blame]	156	Mixed types are currently treated as an error, except that int is
				157	allowed.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	158	"""
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	159	count = 0
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	160	n, d = _exact_ratio(start)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	161	partials = {d: n}
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	162	partials_get = partials.get
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	163	T = _coerce(int, type(start))
				164	for typ, values in groupby(data, type):
				165	T = _coerce(T, typ) # or raise TypeError
				166	for n,d in map(_exact_ratio, values):
				167	count += 1
				168	partials[d] = partials_get(d, 0) + n
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	169	if None in partials:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	170	# The sum will be a NAN or INF. We can ignore all the finite
				171	# partials, and just look at this special one.
				172	total = partials[None]
				173	assert not _isfinite(total)
				174	else:
				175	# Sum all the partial sums using builtin sum.
				176	# FIXME is this faster if we sum them in order of the denominator?
				177	total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
				178	return (T, total, count)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	179
				180
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	181	def _isfinite(x):
				182	try:
				183	return x.is_finite() # Likely a Decimal.
				184	except AttributeError:
				185	return math.isfinite(x) # Coerces to float first.
				186
				187
				188	def _coerce(T, S):
				189	"""Coerce types T and S to a common type, or raise TypeError.
				190
				191	Coercion rules are currently an implementation detail. See the CoerceTest
				192	test class in test_statistics for details.
				193	"""
				194	# See http://bugs.python.org/issue24068.
				195	assert T is not bool, "initial type T is bool"
				196	# If the types are the same, no need to coerce anything. Put this
				197	# first, so that the usual case (no coercion needed) happens as soon
				198	# as possible.
				199	if T is S: return T
				200	# Mixed int & other coerce to the other type.
				201	if S is int or S is bool: return T
				202	if T is int: return S
				203	# If one is a (strict) subclass of the other, coerce to the subclass.
				204	if issubclass(S, T): return S
				205	if issubclass(T, S): return T
				206	# Ints coerce to the other type.
				207	if issubclass(T, int): return S
				208	if issubclass(S, int): return T
				209	# Mixed fraction & float coerces to float (or float subclass).
				210	if issubclass(T, Fraction) and issubclass(S, float):
				211	return S
				212	if issubclass(T, float) and issubclass(S, Fraction):
				213	return T
				214	# Any other combination is disallowed.
				215	msg = "don't know how to coerce %s and %s"
				216	raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan	73afe2a	2014-02-08 19:58:04 +1000	[diff] [blame]	217
				218
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	219	def _exact_ratio(x):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	220	"""Return Real number x to exact (numerator, denominator) pair.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	221
				222	>>> _exact_ratio(0.25)
				223	(1, 4)
				224
				225	x is expected to be an int, Fraction, Decimal or float.
				226	"""
				227	try:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	228	# Optimise the common case of floats. We expect that the most often
				229	# used numeric type will be builtin floats, so try to make this as
				230	# fast as possible.
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	231	if type(x) is float or type(x) is Decimal:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	232	return x.as_integer_ratio()
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	233	try:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	234	# x may be an int, Fraction, or Integral ABC.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	235	return (x.numerator, x.denominator)
				236	except AttributeError:
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	237	try:
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	238	# x may be a float or Decimal subclass.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	239	return x.as_integer_ratio()
				240	except AttributeError:
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	241	# Just give up?
				242	pass
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	243	except (OverflowError, ValueError):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	244	# float NAN or INF.
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	245	assert not _isfinite(x)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	246	return (x, None)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	247	msg = "can't convert type '{}' to numerator/denominator"
				248	raise TypeError(msg.format(type(x).__name__))
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	249
				250
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	251	def _convert(value, T):
				252	"""Convert value to given numeric type T."""
				253	if type(value) is T:
				254	# This covers the cases where T is Fraction, or where value is
				255	# a NAN or INF (Decimal or float).
				256	return value
				257	if issubclass(T, int) and value.denominator != 1:
				258	T = float
				259	try:
				260	# FIXME: what do we do if this overflows?
				261	return T(value)
				262	except TypeError:
				263	if issubclass(T, Decimal):
				264	return T(value.numerator)/T(value.denominator)
				265	else:
				266	raise
				267
				268
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	269	def _find_lteq(a, x):
				270	'Locate the leftmost value exactly equal to x'
				271	i = bisect_left(a, x)
				272	if i != len(a) and a[i] == x:
				273	return i
				274	raise ValueError
				275
				276
				277	def _find_rteq(a, l, x):
				278	'Locate the rightmost value exactly equal to x'
				279	i = bisect_right(a, x, lo=l)
				280	if i != (len(a)+1) and a[i-1] == x:
				281	return i-1
				282	raise ValueError
				283
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	284
				285	def _fail_neg(values, errmsg='negative value'):
				286	"""Iterate over values, failing if any are less than zero."""
				287	for x in values:
				288	if x < 0:
				289	raise StatisticsError(errmsg)
				290	yield x
				291
				292
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	293	# === Measures of central tendency (averages) ===
				294
				295	def mean(data):
				296	"""Return the sample arithmetic mean of data.
				297
				298	>>> mean([1, 2, 3, 4, 4])
				299	2.8
				300
				301	>>> from fractions import Fraction as F
				302	>>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
				303	Fraction(13, 21)
				304
				305	>>> from decimal import Decimal as D
				306	>>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
				307	Decimal('0.5625')
				308
				309	If ``data`` is empty, StatisticsError will be raised.
				310	"""
				311	if iter(data) is data:
				312	data = list(data)
				313	n = len(data)
				314	if n < 1:
				315	raise StatisticsError('mean requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	316	T, total, count = _sum(data)
				317	assert count == n
				318	return _convert(total/n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	319
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	320
def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    Faster than mean() and always returns a float.  The result is highly
    accurate but not as perfect as mean().  An empty input dataset raises
    StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        size = len(data)
    except TypeError:
        # No __len__() (e.g. an iterator): count the items as fsum()
        # consumes them.
        size = 0

        def tally(stream):
            nonlocal size
            for size, item in enumerate(stream, start=1):
                yield item

        data = tally(data)
    total = fsum(data)
    try:
        return total / size
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	347
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	348
def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        # Mean of the logarithms, mapped back with exp().
        return exp(fmean(map(log, data)))
    except ValueError:
        # log() raises ValueError for zero or negative input, and the
        # StatisticsError that fmean() raises for empty data is itself a
        # ValueError; both are reported with the same message.
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              'containing positive numbers') from None
				366
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	367
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data.
    It is often the appropriate average for rates or ratios, such as
    speeds.  Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    size = len(data)
    if size < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    if size == 1:
        # A single value is returned as is, after a type and sign check.
        only = data[0]
        if not isinstance(only, (numbers.Real, Decimal)):
            raise TypeError('unsupported type')
        if only < 0:
            raise StatisticsError(errmsg)
        return only
    reciprocals = (1/value for value in _fail_neg(data, errmsg))
    try:
        kind, total, seen = _sum(reciprocals)
    except ZeroDivisionError:
        # A zero in the data makes 1/value fail; the result is then 0.
        return 0
    assert seen == size
    return _convert(size/total, kind)
				412
				413
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	414	# FIXME: investigate ways to calculate medians without sorting? Quickselect?
				415	def median(data):
				416	"""Return the median (middle value) of numeric data.
				417
				418	When the number of data points is odd, return the middle data point.
				419	When the number of data points is even, the median is interpolated by
				420	taking the average of the two middle values:
				421
				422	>>> median([1, 3, 5])
				423	3
				424	>>> median([1, 3, 5, 7])
				425	4.0
				426
				427	"""
				428	data = sorted(data)
				429	n = len(data)
				430	if n == 0:
				431	raise StatisticsError("no median for empty data")
				432	if n%2 == 1:
				433	return data[n//2]
				434	else:
				435	i = n//2
				436	return (data[i - 1] + data[i])/2
				437
				438
				439	def median_low(data):
				440	"""Return the low median of numeric data.
				441
				442	When the number of data points is odd, the middle value is returned.
				443	When it is even, the smaller of the two middle values is returned.
				444
				445	>>> median_low([1, 3, 5])
				446	3
				447	>>> median_low([1, 3, 5, 7])
				448	3
				449
				450	"""
				451	data = sorted(data)
				452	n = len(data)
				453	if n == 0:
				454	raise StatisticsError("no median for empty data")
				455	if n%2 == 1:
				456	return data[n//2]
				457	else:
				458	return data[n//2 - 1]
				459
				460
				461	def median_high(data):
				462	"""Return the high median of data.
				463
				464	When the number of data points is odd, the middle value is returned.
				465	When it is even, the larger of the two middle values is returned.
				466
				467	>>> median_high([1, 3, 5])
				468	3
				469	>>> median_high([1, 3, 5, 7])
				470	5
				471
				472	"""
				473	data = sorted(data)
				474	n = len(data)
				475	if n == 0:
				476	raise StatisticsError("no median for empty data")
				477	return data[n//2]
				478
				479
				480	def median_grouped(data, interval=1):
Zachary Ware	df2660e	2015-10-27 22:00:41 -0500	[diff] [blame]	481	"""Return the 50th percentile (median) of grouped continuous data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	482
				483	>>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
				484	3.7
				485	>>> median_grouped([52, 52, 53, 54])
				486	52.5
				487
				488	This calculates the median as the 50th percentile, and should be
				489	used when your data is continuous and grouped. In the above example,
				490	the values 1, 2, 3, etc. actually represent the midpoint of classes
				491	0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
				492	class 3.5-4.5, and interpolation is used to estimate it.
				493
				494	Optional argument ``interval`` represents the class interval, and
				495	defaults to 1. Changing the class interval naturally will change the
				496	interpolated 50th percentile value:
				497
				498	>>> median_grouped([1, 3, 3, 5, 7], interval=1)
				499	3.25
				500	>>> median_grouped([1, 3, 3, 5, 7], interval=2)
				501	3.5
				502
				503	This function does not check whether the data points are at least
				504	``interval`` apart.
				505	"""
				506	data = sorted(data)
				507	n = len(data)
				508	if n == 0:
				509	raise StatisticsError("no median for empty data")
				510	elif n == 1:
				511	return data[0]
				512	# Find the value at the midpoint. Remember this corresponds to the
				513	# centre of the class interval.
				514	x = data[n//2]
				515	for obj in (x, interval):
				516	if isinstance(obj, (str, bytes)):
				517	raise TypeError('expected number but got %r' % obj)
				518	try:
				519	L = x - interval/2 # The lower limit of the median interval.
				520	except TypeError:
				521	# Mixed type. For now we just coerce to float.
				522	L = float(x) - float(interval)/2
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	523
				524	# Uses bisection search to search for x in data with log(n) time complexity
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	525	# Find the position of leftmost occurrence of x in data
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	526	l1 = _find_lteq(data, x)
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	527	# Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	528	# Assuming always l1 <= l2
				529	l2 = _find_rteq(data, l1, x)
				530	cf = l1
				531	f = l2 - l1 + 1
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	532	return L + interval*(n/2 - cf)/f
				533
				534
				535	def mode(data):
				536	"""Return the most common data point from discrete or nominal data.
				537
				538	``mode`` assumes discrete data, and returns a single value. This is the
				539	standard treatment of the mode as commonly taught in schools:
				540
				541	>>> mode([1, 1, 2, 3, 3, 3, 3, 4])
				542	3
				543
				544	This also works with nominal (non-numeric) data:
				545
				546	>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
				547	'red'
				548
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	549	If there are multiple modes, return the first one encountered.
				550
				551	>>> mode(['red', 'red', 'green', 'blue', 'blue'])
				552	'red'
				553
				554	If data is empty, ``mode``, raises StatisticsError.
				555
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	556	"""
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	557	data = iter(data)
				558	try:
				559	return Counter(data).most_common(1)[0][0]
				560	except IndexError:
				561	raise StatisticsError('no mode for empty data') from None
				562
				563
				564	def multimode(data):
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	565	"""Return a list of the most frequently occurring values.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	566
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	567	Will return more than one result if there are multiple modes
				568	or an empty list if data is empty.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	569
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	570	>>> multimode('aabbbbbbbbcc')
				571	['b']
				572	>>> multimode('aabbbbccddddeeffffgg')
				573	['b', 'd', 'f']
				574	>>> multimode('')
				575	[]
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	576	"""
				577	counts = Counter(iter(data)).most_common()
				578	maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
				579	return list(map(itemgetter(0), mode_items))
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	580
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	581
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	582	# Notes on methods for computing quantiles
				583	# ----------------------------------------
				584	#
				585	# There is no one perfect way to compute quantiles. Here we offer
				586	# two methods that serve common needs. Most other packages
				587	# surveyed offered at least one or both of these two, making them
				588	# "standard" in the sense of "widely-adopted and reproducible".
				589	# They are also easy to explain, easy to compute manually, and have
				590	# straight-forward interpretations that aren't surprising.
				591
				592	# The default method is known as "R6", "PERCENTILE.EXC", or "expected
				593	# value of rank order statistics". The alternative method is known as
				594	# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
				595
				596	# For sample data where there is a positive probability for values
				597	# beyond the range of the data, the R6 exclusive method is a
				598	# reasonable choice. Consider a random sample of nine values from a
				599	# population with a uniform distribution from 0.0 to 100.0. The
				600	# distribution of the third ranked sample point is described by
				601	# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
				602	# mean=0.300. Only the latter (which corresponds with R6) gives the
				603	# desired cut point with 30% of the population falling below that
				604	# value, making it comparable to a result from an inv_cdf() function.
				605
				606	# For describing population data where the end points are known to
				607	# be included in the data, the R7 inclusive method is a reasonable
				608	# choice. Instead of the mean, it uses the mode of the beta
				609	# distribution for the interior points. Per Hyndman & Fan, "One nice
				610	# property is that the vertices of Q7(p) divide the range into n - 1
				611	# intervals, and exactly 100p% of the intervals lie to the left of
				612	# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
				613
Raymond Hettinger	eed5e9a	2019-07-19 01:57:22 -0700	[diff] [blame]	614	# If needed, other methods could be added. However, for now, the
				615	# position is that fewer options make for easier choices and that
				616	# external packages can be used for anything more advanced.
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	617
def quantiles(dist, /, *, n=4, method='exclusive'):
    """Divide dist into n continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set n to 4 for quartiles (the default).  Set n to 10 for deciles.
    Set n to 100 for percentiles, which gives the 99 cut points that
    separate dist into 100 equal-sized groups.

    The dist can be any iterable containing sample data or it can be
    an instance of a class that defines an inv_cdf() method.  For sample
    data, the cut points are linearly interpolated between data points.

    If method is set to 'inclusive', dist is treated as population
    data.  The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.

    Raises StatisticsError if n is less than 1 or if the sample data has
    fewer than two points, and ValueError if method is neither
    'exclusive' nor 'inclusive'.
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    if hasattr(dist, 'inv_cdf'):
        # A distribution object: ask it directly for each cut point.
        return [dist.inv_cdf(i / n) for i in range(1, n)]
    data = sorted(dist)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        m = ld - 1
        result = []
        for i in range(1, n):
            j = i * m // n                  # rescale i to m/n
            delta = i*m - j*n               # exact integer math
            interpolated = (data[j] * (n - delta) + data[j+1] * delta) / n
            result.append(interpolated)
        return result
    if method == 'exclusive':
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n                               # rescale i to m/n
            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
            delta = i*m - j*n                            # exact integer math
            interpolated = (data[j-1] * (n - delta) + data[j] * delta) / n
            result.append(interpolated)
        return result
    raise ValueError(f'Unknown method: {method!r}')
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	663
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	664
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	665	# === Measures of spread ===
				666
				667	# See http://mathworld.wolfram.com/Variance.html
				668	# http://mathworld.wolfram.com/SampleVariance.html
				669	# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
				670	#
				671	# Under no circumstances use the so-called "computational formula for
				672	# variance", as that is only suitable for hand calculations with a small
				673	# amount of low-precision data. It has terrible numeric properties.
				674	#
				675	# See a comparison of three computational methods here:
				676	# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
				677
				678	def _ss(data, c=None):
				679	"""Return sum of square deviations of sequence data.
				680
				681	If ``c`` is None, the mean is calculated in one pass, and the deviations
				682	from the mean are calculated in a second pass. Otherwise, deviations are
				683	calculated from ``c`` as given. Use the second case with care, as it can
				684	lead to garbage results.
				685	"""
				686	if c is None:
				687	c = mean(data)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	688	T, total, count = _sum((x-c)**2 for x in data)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	689	# The following sum should mathematically equal zero, but due to rounding
				690	# error may not.
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	691	U, total2, count2 = _sum((x-c) for x in data)
				692	assert T == U and count == count2
				693	total -= total2**2/len(data)
				694	assert not total < 0, 'negative sum of square deviations: %f' % total
				695	return (T, total)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	696
				697
				698	def variance(data, xbar=None):
				699	"""Return the sample variance of data.
				700
				701	data should be an iterable of Real-valued numbers, with at least two
				702	values. The optional argument xbar, if given, should be the mean of
				703	the data. If it is missing or None, the mean is automatically calculated.
				704
				705	Use this function when your data is a sample from a population. To
				706	calculate the variance from the entire population, see ``pvariance``.
				707
				708	Examples:
				709
				710	>>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
				711	>>> variance(data)
				712	1.3720238095238095
				713
				714	If you have already calculated the mean of your data, you can pass it as
				715	the optional second argument ``xbar`` to avoid recalculating it:
				716
				717	>>> m = mean(data)
				718	>>> variance(data, m)
				719	1.3720238095238095
				720
				721	This function does not check that ``xbar`` is actually the mean of
				722	``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
				723	impossible results.
				724
				725	Decimals and Fractions are supported:
				726
				727	>>> from decimal import Decimal as D
				728	>>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				729	Decimal('31.01875')
				730
				731	>>> from fractions import Fraction as F
				732	>>> variance([F(1, 6), F(1, 2), F(5, 3)])
				733	Fraction(67, 108)
				734
				735	"""
				736	if iter(data) is data:
				737	data = list(data)
				738	n = len(data)
				739	if n < 2:
				740	raise StatisticsError('variance requires at least two data points')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	741	T, ss = _ss(data, xbar)
				742	return _convert(ss/(n-1), T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	743
				744
				745	def pvariance(data, mu=None):
				746	"""Return the population variance of ``data``.
				747
				748	data should be an iterable of Real-valued numbers, with at least one
				749	value. The optional argument mu, if given, should be the mean of
				750	the data. If it is missing or None, the mean is automatically calculated.
				751
				752	Use this function to calculate the variance from the entire population.
				753	To estimate the variance from a sample, the ``variance`` function is
				754	usually a better choice.
				755
				756	Examples:
				757
				758	>>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
				759	>>> pvariance(data)
				760	1.25
				761
				762	If you have already calculated the mean of the data, you can pass it as
				763	the optional second argument to avoid recalculating it:
				764
				765	>>> mu = mean(data)
				766	>>> pvariance(data, mu)
				767	1.25
				768
				769	This function does not check that ``mu`` is actually the mean of ``data``.
				770	Giving arbitrary values for ``mu`` may lead to invalid or impossible
				771	results.
				772
				773	Decimals and Fractions are supported:
				774
				775	>>> from decimal import Decimal as D
				776	>>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				777	Decimal('24.815')
				778
				779	>>> from fractions import Fraction as F
				780	>>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
				781	Fraction(13, 72)
				782
				783	"""
				784	if iter(data) is data:
				785	data = list(data)
				786	n = len(data)
				787	if n < 1:
				788	raise StatisticsError('pvariance requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	789	T, ss = _ss(data, mu)
				790	return _convert(ss/n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	791
				792
				793	def stdev(data, xbar=None):
				794	"""Return the square root of the sample variance.
				795
				796	See ``variance`` for arguments and other details.
				797
				798	>>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				799	1.0810874155219827
				800
				801	"""
				802	var = variance(data, xbar)
				803	try:
				804	return var.sqrt()
				805	except AttributeError:
				806	return math.sqrt(var)
				807
				808
				809	def pstdev(data, mu=None):
				810	"""Return the square root of the population variance.
				811
				812	See ``pvariance`` for arguments and other details.
				813
				814	>>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				815	0.986893273527251
				816
				817	"""
				818	var = pvariance(data, mu)
				819	try:
				820	return var.sqrt()
				821	except AttributeError:
				822	return math.sqrt(var)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	823
Raymond Hettinger	1c0e9bb	2019-07-21 12:13:07 -0700	[diff] [blame]	824
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	825	## Normal Distribution #####################################################
				826
Dong-hee Na	0a18ee4	2019-08-24 07:20:30 +0900	[diff] [blame]	827
				828	def _normal_dist_inv_cdf(p, mu, sigma):
				829	# There is no closed-form solution to the inverse CDF for the normal
				830	# distribution, so we use a rational approximation instead:
				831	# Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
				832	# Normal Distribution". Applied Statistics. Blackwell Publishing. 37
				833	# (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
				834	q = p - 0.5
				835	if fabs(q) <= 0.425:
				836	r = 0.180625 - q * q
				837	# Hash sum: 55.88319_28806_14901_4439
				838	num = (((((((2.50908_09287_30122_6727e+3 * r +
				839	3.34305_75583_58812_8105e+4) * r +
				840	6.72657_70927_00870_0853e+4) * r +
				841	4.59219_53931_54987_1457e+4) * r +
				842	1.37316_93765_50946_1125e+4) * r +
				843	1.97159_09503_06551_4427e+3) * r +
				844	1.33141_66789_17843_7745e+2) * r +
				845	3.38713_28727_96366_6080e+0) * q
				846	den = (((((((5.22649_52788_52854_5610e+3 * r +
				847	2.87290_85735_72194_2674e+4) * r +
				848	3.93078_95800_09271_0610e+4) * r +
				849	2.12137_94301_58659_5867e+4) * r +
				850	5.39419_60214_24751_1077e+3) * r +
				851	6.87187_00749_20579_0830e+2) * r +
				852	4.23133_30701_60091_1252e+1) * r +
				853	1.0)
				854	x = num / den
				855	return mu + (x * sigma)
				856	r = p if q <= 0.0 else 1.0 - p
				857	r = sqrt(-log(r))
				858	if r <= 5.0:
				859	r = r - 1.6
				860	# Hash sum: 49.33206_50330_16102_89036
				861	num = (((((((7.74545_01427_83414_07640e-4 * r +
				862	2.27238_44989_26918_45833e-2) * r +
				863	2.41780_72517_74506_11770e-1) * r +
				864	1.27045_82524_52368_38258e+0) * r +
				865	3.64784_83247_63204_60504e+0) * r +
				866	5.76949_72214_60691_40550e+0) * r +
				867	4.63033_78461_56545_29590e+0) * r +
				868	1.42343_71107_49683_57734e+0)
				869	den = (((((((1.05075_00716_44416_84324e-9 * r +
				870	5.47593_80849_95344_94600e-4) * r +
				871	1.51986_66563_61645_71966e-2) * r +
				872	1.48103_97642_74800_74590e-1) * r +
				873	6.89767_33498_51000_04550e-1) * r +
				874	1.67638_48301_83803_84940e+0) * r +
				875	2.05319_16266_37758_82187e+0) * r +
				876	1.0)
				877	else:
				878	r = r - 5.0
				879	# Hash sum: 47.52583_31754_92896_71629
				880	num = (((((((2.01033_43992_92288_13265e-7 * r +
				881	2.71155_55687_43487_57815e-5) * r +
				882	1.24266_09473_88078_43860e-3) * r +
				883	2.65321_89526_57612_30930e-2) * r +
				884	2.96560_57182_85048_91230e-1) * r +
				885	1.78482_65399_17291_33580e+0) * r +
				886	5.46378_49111_64114_36990e+0) * r +
				887	6.65790_46435_01103_77720e+0)
				888	den = (((((((2.04426_31033_89939_78564e-15 * r +
				889	1.42151_17583_16445_88870e-7) * r +
				890	1.84631_83175_10054_68180e-5) * r +
				891	7.86869_13114_56132_59100e-4) * r +
				892	1.48753_61290_85061_48525e-2) * r +
				893	1.36929_88092_27358_05310e-1) * r +
				894	5.99832_20655_58879_37690e-1) * r +
				895	1.0)
				896	x = num / den
				897	if q < 0.0:
				898	x = -x
				899	return mu + (x * sigma)
				900
				901
class NormalDist:
    """Normal distribution of a random variable.

    An instance stores a mean (``mu``) and a standard deviation (``sigma``)
    and offers sampling, pdf/cdf/inverse-cdf evaluation, an overlap
    measure, and arithmetic that produces new NormalDist instances.
    """
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        """NormalDist where mu is the mean and sigma is the standard deviation.

        Raises StatisticsError if sigma is negative.
        """
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = mu
        self._sigma = sigma

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        # The data is traversed more than once (fmean, then stdev), so
        # anything that is not already a list or tuple is copied to a list.
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, *, seed=None):
        """Generate n samples for a given mean and standard deviation.

        With seed=None the module-level random.gauss is used; otherwise a
        private random.Random(seed) instance supplies the values.
        """
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self._mu, self._sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        """Probability density function.  P(x <= X < x+dx) / dx

        Raises StatisticsError when sigma is zero.
        """
        variance = self._sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        # exp(-(x - mu)**2 / (2 * variance)) / sqrt(2 * pi * variance)
        return exp((x - self._mu)**2.0 / (-2.0 * variance)) / sqrt(tau * variance)

    def cdf(self, x):
        """Cumulative distribution function.  P(X <= x)

        Raises StatisticsError when sigma is zero.
        """
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.

        Raises StatisticsError unless 0.0 < p < 1.0 and sigma is positive.
        """
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self._sigma <= 0.0:
            raise StatisticsError(
                'inv_cdf() not defined when sigma is at or below zero')
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)

    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

        Raises TypeError if other is not a NormalDist, and StatisticsError
        if either distribution has a sigma of zero.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):   # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            # Equal variances: the result depends only on the distance
            # between the two means.
            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
        # Unequal variances: x1 and x2 are the two points at which the
        # cdf differences are evaluated.
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    @property
    def mean(self):
        "Arithmetic mean of the normal distribution."
        return self._mu

    @property
    def stdev(self):
        "Standard deviation of the normal distribution."
        return self._sigma

    @property
    def variance(self):
        "Square of the standard deviation."
        return self._sigma ** 2.0

    def __add__(x1, x2):
        """Add a constant or another NormalDist instance.

        If other is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If other is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            # hypot() gives sqrt(s1**2 + s2**2), i.e. the variances add.
            return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu + x2, x1._sigma)

    def __sub__(x1, x2):
        """Subtract a constant or another NormalDist instance.

        If other is a constant, translate by the constant mu,
        leaving sigma unchanged.

        If other is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            # hypot() gives sqrt(s1**2 + s2**2), i.e. the variances add.
            return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu - x2, x1._sigma)

    def __mul__(x1, x2):
        """Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))

    def __truediv__(x1, x2):
        """Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))

    def __pos__(x1):
        "Return a copy of the instance."
        return NormalDist(x1._mu, x1._sigma)

    def __neg__(x1):
        "Negates mu while keeping sigma the same."
        return NormalDist(-x1._mu, x1._sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        "Subtract a NormalDist from a constant or another NormalDist."
        # x2 - x1 is computed as -(x1 - x2).
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        "Two NormalDist objects are equal if their mu and sigma are both equal."
        if not isinstance(x2, NormalDist):
            return NotImplemented
        # Both fields on the left must come from x1; otherwise sigma is
        # compared with itself and differing sigmas go unnoticed.
        return (x1._mu, x1._sigma) == (x2._mu, x2._sigma)

    def __hash__(self):
        "NormalDist objects hash equal if their mu and sigma are both equal."
        return hash((self._mu, self._sigma))

    def __repr__(self):
        return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1083
# If available, use the C implementation: importing the name here rebinds
# _normal_dist_inv_cdf, replacing the pure-Python function defined earlier
# in this module.
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    # _statistics could not be imported; keep the pure-Python version.
    pass
				1089
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1090
				1091	if __name__ == '__main__':
				1092
				1093	# Show math operations computed analytically in comparsion
				1094	# to a monte carlo simulation of the same operations
				1095
				1096	from math import isclose
				1097	from operator import add, sub, mul, truediv
				1098	from itertools import repeat
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	1099	import doctest
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1100
				1101	g1 = NormalDist(10, 20)
				1102	g2 = NormalDist(-5, 25)
				1103
				1104	# Test scaling by a constant
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1105	assert (g1 * 5 / 5).mean == g1.mean
				1106	assert (g1 * 5 / 5).stdev == g1.stdev
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1107
				1108	n = 100_000
				1109	G1 = g1.samples(n)
				1110	G2 = g2.samples(n)
				1111
				1112	for func in (add, sub):
				1113	print(f'\nTest {func.__name__} with another NormalDist:')
				1114	print(func(g1, g2))
				1115	print(NormalDist.from_samples(map(func, G1, G2)))
				1116
				1117	const = 11
				1118	for func in (add, sub, mul, truediv):
				1119	print(f'\nTest {func.__name__} with a constant:')
				1120	print(func(g1, const))
				1121	print(NormalDist.from_samples(map(func, G1, repeat(const))))
				1122
				1123	const = 19
				1124	for func in (add, sub, mul):
				1125	print(f'\nTest constant with {func.__name__}:')
				1126	print(func(const, g1))
				1127	print(NormalDist.from_samples(map(func, repeat(const), G1)))
				1128
				1129	def assert_close(G1, G2):
Raymond Hettinger	02c91f5	2019-07-21 00:34:47 -0700	[diff] [blame]	1130	assert isclose(G1.mean, G1.mean, rel_tol=0.01), (G1, G2)
				1131	assert isclose(G1.stdev, G2.stdev, rel_tol=0.01), (G1, G2)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1132
				1133	X = NormalDist(-105, 73)
				1134	Y = NormalDist(31, 47)
				1135	s = 32.75
				1136	n = 100_000
				1137
				1138	S = NormalDist.from_samples([x + s for x in X.samples(n)])
				1139	assert_close(X + s, S)
				1140
				1141	S = NormalDist.from_samples([x - s for x in X.samples(n)])
				1142	assert_close(X - s, S)
				1143
				1144	S = NormalDist.from_samples([x * s for x in X.samples(n)])
				1145	assert_close(X * s, S)
				1146
				1147	S = NormalDist.from_samples([x / s for x in X.samples(n)])
				1148	assert_close(X / s, S)
				1149
				1150	S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
				1151	Y.samples(n))])
				1152	assert_close(X + Y, S)
				1153
				1154	S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
				1155	Y.samples(n))])
				1156	assert_close(X - Y, S)
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	1157
				1158	print(doctest.testmod())