Blame - Lib/statistics.py - platform/external/python/cpython3

blob: c5c6e47fb3f3eb3fb10a37e52d46f9f43fc7eea8 [file] [log] [blame]

Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	1	"""
				2	Basic statistics module.
				3
				4	This module provides functions for calculating statistics of data, including
				5	averages, variance, and standard deviation.
				6
				7	Calculating averages
				8	--------------------
				9
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	10	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	11	Function Description
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	12	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	13	mean Arithmetic mean (average) of data.
Raymond Hettinger	7280048	2019-04-23 01:35:16 -0700	[diff] [blame]	14	fmean Fast, floating point arithmetic mean.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	15	geometric_mean Geometric mean of data.
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	16	harmonic_mean Harmonic mean of data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	17	median Median (middle value) of data.
				18	median_low Low median of data.
				19	median_high High median of data.
				20	median_grouped Median, or 50th percentile, of grouped data.
				21	mode Mode (most common value) of data.
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	22	multimode List of modes (most common values of data).
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	23	quantiles Divide data into intervals with equal probability.
				24	================== ==================================================
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	25
				26	Calculate the arithmetic mean ("the average") of data:
				27
				28	>>> mean([-1.0, 2.5, 3.25, 5.75])
				29	2.625
				30
				31
				32	Calculate the standard median of discrete data:
				33
				34	>>> median([2, 3, 4, 5])
				35	3.5
				36
				37
				38	Calculate the median, or 50th percentile, of data grouped into class intervals
				39	centred on the data values provided. E.g. if your data points are rounded to
				40	the nearest whole number:
				41
				42	>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
				43	2.8333333333...
				44
				45	This should be interpreted in this way: you have two data points in the class
				46	interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
				47	the class interval 3.5-4.5. The median of these data points is 2.8333...
				48
				49
				50	Calculating variability or spread
				51	---------------------------------
				52
				53	================== =============================================
				54	Function Description
				55	================== =============================================
				56	pvariance Population variance of data.
				57	variance Sample variance of data.
				58	pstdev Population standard deviation of data.
				59	stdev Sample standard deviation of data.
				60	================== =============================================
				61
				62	Calculate the standard deviation of sample data:
				63
				64	>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
				65	4.38961843444...
				66
				67	If you have previously calculated the mean, you can pass it as the optional
				68	second argument to the four "spread" functions to avoid recalculating it:
				69
				70	>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
				71	>>> mu = mean(data)
				72	>>> pvariance(data, mu)
				73	2.5
				74
				75
				76	Exceptions
				77	----------
				78
				79	A single exception is defined: StatisticsError is a subclass of ValueError.
				80
				81	"""
				82
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	83	__all__ = [
				84	'NormalDist',
				85	'StatisticsError',
				86	'fmean',
				87	'geometric_mean',
				88	'harmonic_mean',
				89	'mean',
				90	'median',
				91	'median_grouped',
				92	'median_high',
				93	'median_low',
				94	'mode',
				95	'multimode',
				96	'pstdev',
				97	'pvariance',
				98	'quantiles',
				99	'stdev',
				100	'variance',
				101	]
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	102
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	103	import math
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	104	import numbers
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	105	import random
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	106
				107	from fractions import Fraction
				108	from decimal import Decimal
Victor Stinner	d6debb2	2017-03-27 16:05:26 +0200	[diff] [blame]	109	from itertools import groupby
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	110	from bisect import bisect_left, bisect_right
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	111	from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	112	from operator import itemgetter
				113	from collections import Counter
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	114
				115	# === Exceptions ===
				116
				117	class StatisticsError(ValueError):
				118	pass
				119
				120
				121	# === Private utilities ===
				122
				123	def _sum(data, start=0):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	124	"""_sum(data [, start]) -> (type, sum, count)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	125
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	126	Return a high-precision sum of the given numeric data as a fraction,
				127	together with the type to be converted to and the count of items.
				128
				129	If optional argument ``start`` is given, it is added to the total.
				130	If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	131
				132
				133	Examples
				134	--------
				135
				136	>>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	137	(<class 'float'>, Fraction(11, 1), 5)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	138
				139	Some sources of round-off error will be avoided:
				140
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	141	# Built-in sum returns zero.
				142	>>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	143	(<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	144
				145	Fractions and Decimals are also supported:
				146
				147	>>> from fractions import Fraction as F
				148	>>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	149	(<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	150
				151	>>> from decimal import Decimal as D
				152	>>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
				153	>>> _sum(data)
Benjamin Peterson	ab078e9	2016-07-13 21:13:29 -0700	[diff] [blame]	154	(<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	155
Nick Coghlan	73afe2a	2014-02-08 19:58:04 +1000	[diff] [blame]	156	Mixed types are currently treated as an error, except that int is
				157	allowed.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	158	"""
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	159	count = 0
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	160	n, d = _exact_ratio(start)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	161	partials = {d: n}
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	162	partials_get = partials.get
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	163	T = _coerce(int, type(start))
				164	for typ, values in groupby(data, type):
				165	T = _coerce(T, typ) # or raise TypeError
				166	for n,d in map(_exact_ratio, values):
				167	count += 1
				168	partials[d] = partials_get(d, 0) + n
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	169	if None in partials:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	170	# The sum will be a NAN or INF. We can ignore all the finite
				171	# partials, and just look at this special one.
				172	total = partials[None]
				173	assert not _isfinite(total)
				174	else:
				175	# Sum all the partial sums using builtin sum.
				176	# FIXME is this faster if we sum them in order of the denominator?
				177	total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
				178	return (T, total, count)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	179
				180
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	181	def _isfinite(x):
				182	try:
				183	return x.is_finite() # Likely a Decimal.
				184	except AttributeError:
				185	return math.isfinite(x) # Coerces to float first.
				186
				187
				188	def _coerce(T, S):
				189	"""Coerce types T and S to a common type, or raise TypeError.
				190
				191	Coercion rules are currently an implementation detail. See the CoerceTest
				192	test class in test_statistics for details.
				193	"""
				194	# See http://bugs.python.org/issue24068.
				195	assert T is not bool, "initial type T is bool"
				196	# If the types are the same, no need to coerce anything. Put this
				197	# first, so that the usual case (no coercion needed) happens as soon
				198	# as possible.
				199	if T is S: return T
				200	# Mixed int & other coerce to the other type.
				201	if S is int or S is bool: return T
				202	if T is int: return S
				203	# If one is a (strict) subclass of the other, coerce to the subclass.
				204	if issubclass(S, T): return S
				205	if issubclass(T, S): return T
				206	# Ints coerce to the other type.
				207	if issubclass(T, int): return S
				208	if issubclass(S, int): return T
				209	# Mixed fraction & float coerces to float (or float subclass).
				210	if issubclass(T, Fraction) and issubclass(S, float):
				211	return S
				212	if issubclass(T, float) and issubclass(S, Fraction):
				213	return T
				214	# Any other combination is disallowed.
				215	msg = "don't know how to coerce %s and %s"
				216	raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan	73afe2a	2014-02-08 19:58:04 +1000	[diff] [blame]	217
				218
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	219	def _exact_ratio(x):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	220	"""Return Real number x to exact (numerator, denominator) pair.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	221
				222	>>> _exact_ratio(0.25)
				223	(1, 4)
				224
				225	x is expected to be an int, Fraction, Decimal or float.
				226	"""
				227	try:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	228	# Optimise the common case of floats. We expect that the most often
				229	# used numeric type will be builtin floats, so try to make this as
				230	# fast as possible.
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	231	if type(x) is float or type(x) is Decimal:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	232	return x.as_integer_ratio()
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	233	try:
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	234	# x may be an int, Fraction, or Integral ABC.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	235	return (x.numerator, x.denominator)
				236	except AttributeError:
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	237	try:
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	238	# x may be a float or Decimal subclass.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	239	return x.as_integer_ratio()
				240	except AttributeError:
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	241	# Just give up?
				242	pass
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	243	except (OverflowError, ValueError):
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	244	# float NAN or INF.
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	245	assert not _isfinite(x)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	246	return (x, None)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	247	msg = "can't convert type '{}' to numerator/denominator"
				248	raise TypeError(msg.format(type(x).__name__))
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	249
				250
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	251	def _convert(value, T):
				252	"""Convert value to given numeric type T."""
				253	if type(value) is T:
				254	# This covers the cases where T is Fraction, or where value is
				255	# a NAN or INF (Decimal or float).
				256	return value
				257	if issubclass(T, int) and value.denominator != 1:
				258	T = float
				259	try:
				260	# FIXME: what do we do if this overflows?
				261	return T(value)
				262	except TypeError:
				263	if issubclass(T, Decimal):
				264	return T(value.numerator)/T(value.denominator)
				265	else:
				266	raise
				267
				268
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	269	def _find_lteq(a, x):
				270	'Locate the leftmost value exactly equal to x'
				271	i = bisect_left(a, x)
				272	if i != len(a) and a[i] == x:
				273	return i
				274	raise ValueError
				275
				276
				277	def _find_rteq(a, l, x):
				278	'Locate the rightmost value exactly equal to x'
				279	i = bisect_right(a, x, lo=l)
				280	if i != (len(a)+1) and a[i-1] == x:
				281	return i-1
				282	raise ValueError
				283
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	284
				285	def _fail_neg(values, errmsg='negative value'):
				286	"""Iterate over values, failing if any are less than zero."""
				287	for x in values:
				288	if x < 0:
				289	raise StatisticsError(errmsg)
				290	yield x
				291
				292
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	293	# === Measures of central tendency (averages) ===
				294
				295	def mean(data):
				296	"""Return the sample arithmetic mean of data.
				297
				298	>>> mean([1, 2, 3, 4, 4])
				299	2.8
				300
				301	>>> from fractions import Fraction as F
				302	>>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
				303	Fraction(13, 21)
				304
				305	>>> from decimal import Decimal as D
				306	>>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
				307	Decimal('0.5625')
				308
				309	If ``data`` is empty, StatisticsError will be raised.
				310	"""
				311	if iter(data) is data:
				312	data = list(data)
				313	n = len(data)
				314	if n < 1:
				315	raise StatisticsError('mean requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	316	T, total, count = _sum(data)
				317	assert count == n
				318	return _convert(total/n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	319
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	320
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	321	def fmean(data):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	322	"""Convert data to floats and compute the arithmetic mean.
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	323
				324	This runs faster than the mean() function and it always returns a float.
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	325	If the input dataset is empty, it raises a StatisticsError.
				326
				327	>>> fmean([3.5, 4.0, 5.25])
				328	4.25
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	329	"""
				330	try:
				331	n = len(data)
				332	except TypeError:
				333	# Handle iterators that do not define __len__().
				334	n = 0
Miss Islington (bot)	9ddb777	2019-06-05 08:18:13 -0700	[diff] [blame]	335	def count(iterable):
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	336	nonlocal n
Miss Islington (bot)	9ddb777	2019-06-05 08:18:13 -0700	[diff] [blame]	337	for n, x in enumerate(iterable, start=1):
				338	yield x
				339	total = fsum(count(data))
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	340	else:
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	341	total = fsum(data)
Raymond Hettinger	47d9987	2019-02-21 15:06:29 -0800	[diff] [blame]	342	try:
				343	return total / n
				344	except ZeroDivisionError:
				345	raise StatisticsError('fmean requires at least one data point') from None
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	346
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	347
Raymond Hettinger	6463ba3	2019-04-07 09:20:03 -0700	[diff] [blame]	348	def geometric_mean(data):
				349	"""Convert data to floats and compute the geometric mean.
				350
				351	Raises a StatisticsError if the input dataset is empty,
				352	if it contains a zero, or if it contains a negative value.
				353
				354	No special efforts are made to achieve exact results.
				355	(However, this may change in the future.)
				356
				357	>>> round(geometric_mean([54, 24, 36]), 9)
				358	36.0
				359	"""
				360	try:
				361	return exp(fmean(map(log, data)))
				362	except ValueError:
				363	raise StatisticsError('geometric mean requires a non-empty dataset '
				364	' containing positive numbers') from None
				365
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	366
Steven D'Aprano	a474afd	2016-08-09 12:49:01 +1000	[diff] [blame]	367	def harmonic_mean(data):
				368	"""Return the harmonic mean of data.
				369
				370	The harmonic mean, sometimes called the subcontrary mean, is the
				371	reciprocal of the arithmetic mean of the reciprocals of the data,
				372	and is often appropriate when averaging quantities which are rates
				373	or ratios, for example speeds. Example:
				374
				375	Suppose an investor purchases an equal value of shares in each of
				376	three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
				377	What is the average P/E ratio for the investor's portfolio?
				378
				379	>>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio.
				380	3.6
				381
				382	Using the arithmetic mean would give an average of about 5.167, which
				383	is too high.
				384
				385	If ``data`` is empty, or any element is less than zero,
				386	``harmonic_mean`` will raise ``StatisticsError``.
				387	"""
				388	# For a justification for using harmonic mean for P/E ratios, see
				389	# http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
				390	# http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
				391	if iter(data) is data:
				392	data = list(data)
				393	errmsg = 'harmonic mean does not support negative values'
				394	n = len(data)
				395	if n < 1:
				396	raise StatisticsError('harmonic_mean requires at least one data point')
				397	elif n == 1:
				398	x = data[0]
				399	if isinstance(x, (numbers.Real, Decimal)):
				400	if x < 0:
				401	raise StatisticsError(errmsg)
				402	return x
				403	else:
				404	raise TypeError('unsupported type')
				405	try:
				406	T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
				407	except ZeroDivisionError:
				408	return 0
				409	assert count == n
				410	return _convert(n/total, T)
				411
				412
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	413	# FIXME: investigate ways to calculate medians without sorting? Quickselect?
				414	def median(data):
				415	"""Return the median (middle value) of numeric data.
				416
				417	When the number of data points is odd, return the middle data point.
				418	When the number of data points is even, the median is interpolated by
				419	taking the average of the two middle values:
				420
				421	>>> median([1, 3, 5])
				422	3
				423	>>> median([1, 3, 5, 7])
				424	4.0
				425
				426	"""
				427	data = sorted(data)
				428	n = len(data)
				429	if n == 0:
				430	raise StatisticsError("no median for empty data")
				431	if n%2 == 1:
				432	return data[n//2]
				433	else:
				434	i = n//2
				435	return (data[i - 1] + data[i])/2
				436
				437
				438	def median_low(data):
				439	"""Return the low median of numeric data.
				440
				441	When the number of data points is odd, the middle value is returned.
				442	When it is even, the smaller of the two middle values is returned.
				443
				444	>>> median_low([1, 3, 5])
				445	3
				446	>>> median_low([1, 3, 5, 7])
				447	3
				448
				449	"""
				450	data = sorted(data)
				451	n = len(data)
				452	if n == 0:
				453	raise StatisticsError("no median for empty data")
				454	if n%2 == 1:
				455	return data[n//2]
				456	else:
				457	return data[n//2 - 1]
				458
				459
				460	def median_high(data):
				461	"""Return the high median of data.
				462
				463	When the number of data points is odd, the middle value is returned.
				464	When it is even, the larger of the two middle values is returned.
				465
				466	>>> median_high([1, 3, 5])
				467	3
				468	>>> median_high([1, 3, 5, 7])
				469	5
				470
				471	"""
				472	data = sorted(data)
				473	n = len(data)
				474	if n == 0:
				475	raise StatisticsError("no median for empty data")
				476	return data[n//2]
				477
				478
				479	def median_grouped(data, interval=1):
Zachary Ware	df2660e	2015-10-27 22:00:41 -0500	[diff] [blame]	480	"""Return the 50th percentile (median) of grouped continuous data.
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	481
				482	>>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
				483	3.7
				484	>>> median_grouped([52, 52, 53, 54])
				485	52.5
				486
				487	This calculates the median as the 50th percentile, and should be
				488	used when your data is continuous and grouped. In the above example,
				489	the values 1, 2, 3, etc. actually represent the midpoint of classes
				490	0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
				491	class 3.5-4.5, and interpolation is used to estimate it.
				492
				493	Optional argument ``interval`` represents the class interval, and
				494	defaults to 1. Changing the class interval naturally will change the
				495	interpolated 50th percentile value:
				496
				497	>>> median_grouped([1, 3, 3, 5, 7], interval=1)
				498	3.25
				499	>>> median_grouped([1, 3, 3, 5, 7], interval=2)
				500	3.5
				501
				502	This function does not check whether the data points are at least
				503	``interval`` apart.
				504	"""
				505	data = sorted(data)
				506	n = len(data)
				507	if n == 0:
				508	raise StatisticsError("no median for empty data")
				509	elif n == 1:
				510	return data[0]
				511	# Find the value at the midpoint. Remember this corresponds to the
				512	# centre of the class interval.
				513	x = data[n//2]
				514	for obj in (x, interval):
				515	if isinstance(obj, (str, bytes)):
				516	raise TypeError('expected number but got %r' % obj)
				517	try:
				518	L = x - interval/2 # The lower limit of the median interval.
				519	except TypeError:
				520	# Mixed type. For now we just coerce to float.
				521	L = float(x) - float(interval)/2
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	522
				523	# Uses bisection search to search for x in data with log(n) time complexity
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	524	# Find the position of leftmost occurrence of x in data
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	525	l1 = _find_lteq(data, x)
Martin Panter	f157982	2016-05-26 06:03:33 +0000	[diff] [blame]	526	# Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano	3b06e24	2016-05-05 03:54:29 +1000	[diff] [blame]	527	# Assuming always l1 <= l2
				528	l2 = _find_rteq(data, l1, x)
				529	cf = l1
				530	f = l2 - l1 + 1
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	531	return L + interval*(n/2 - cf)/f
				532
				533
				534	def mode(data):
				535	"""Return the most common data point from discrete or nominal data.
				536
				537	``mode`` assumes discrete data, and returns a single value. This is the
				538	standard treatment of the mode as commonly taught in schools:
				539
Miss Islington (bot)	dafbe32	2019-09-05 00:42:22 -0700	[diff] [blame]	540	>>> mode([1, 1, 2, 3, 3, 3, 3, 4])
				541	3
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	542
				543	This also works with nominal (non-numeric) data:
				544
Miss Islington (bot)	dafbe32	2019-09-05 00:42:22 -0700	[diff] [blame]	545	>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
				546	'red'
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	547
Miss Islington (bot)	dafbe32	2019-09-05 00:42:22 -0700	[diff] [blame]	548	If there are multiple modes with same frequency, return the first one
				549	encountered:
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	550
				551	>>> mode(['red', 'red', 'green', 'blue', 'blue'])
				552	'red'
				553
				554	If data is empty, ``mode``, raises StatisticsError.
				555
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	556	"""
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	557	data = iter(data)
Miss Islington (bot)	8fe4755	2019-09-20 22:18:10 -0700	[diff] [blame]	558	pairs = Counter(data).most_common(1)
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	559	try:
Miss Islington (bot)	8fe4755	2019-09-20 22:18:10 -0700	[diff] [blame]	560	return pairs[0][0]
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	561	except IndexError:
				562	raise StatisticsError('no mode for empty data') from None
				563
				564
				565	def multimode(data):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	566	"""Return a list of the most frequently occurring values.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	567
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	568	Will return more than one result if there are multiple modes
				569	or an empty list if data is empty.
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	570
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	571	>>> multimode('aabbbbbbbbcc')
				572	['b']
				573	>>> multimode('aabbbbccddddeeffffgg')
				574	['b', 'd', 'f']
				575	>>> multimode('')
				576	[]
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	577	"""
				578	counts = Counter(iter(data)).most_common()
				579	maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
				580	return list(map(itemgetter(0), mode_items))
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	581
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	582
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	583	# Notes on methods for computing quantiles
				584	# ----------------------------------------
				585	#
				586	# There is no one perfect way to compute quantiles. Here we offer
				587	# two methods that serve common needs. Most other packages
				588	# surveyed offered at least one or both of these two, making them
				589	# "standard" in the sense of "widely-adopted and reproducible".
				590	# They are also easy to explain, easy to compute manually, and have
				591	# straight-forward interpretations that aren't surprising.
				592
				593	# The default method is known as "R6", "PERCENTILE.EXC", or "expected
				594	# value of rank order statistics". The alternative method is known as
				595	# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
				596
				597	# For sample data where there is a positive probability for values
				598	# beyond the range of the data, the R6 exclusive method is a
				599	# reasonable choice. Consider a random sample of nine values from a
				600	# population with a uniform distribution from 0.0 to 100.0. The
				601	# distribution of the third ranked sample point is described by
				602	# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
				603	# mean=0.300. Only the latter (which corresponds with R6) gives the
				604	# desired cut point with 30% of the population falling below that
				605	# value, making it comparable to a result from an inv_cdf() function.
Miss Islington (bot)	8fe4755	2019-09-20 22:18:10 -0700	[diff] [blame]	606	# The R6 exclusive method is also idempotent.
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	607
				608	# For describing population data where the end points are known to
				609	# be included in the data, the R7 inclusive method is a reasonable
				610	# choice. Instead of the mean, it uses the mode of the beta
				611	# distribution for the interior points. Per Hyndman & Fan, "One nice
				612	# property is that the vertices of Q7(p) divide the range into n - 1
				613	# intervals, and exactly 100p% of the intervals lie to the left of
				614	# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
				615
Miss Islington (bot)	e5bfd1c	2019-07-19 02:17:53 -0700	[diff] [blame]	616	# If needed, other methods could be added. However, for now, the
				617	# position is that fewer options make for easier choices and that
				618	# external packages can be used for anything more advanced.
Raymond Hettinger	cba9f84	2019-06-02 21:07:43 -0700	[diff] [blame]	619
Miss Islington (bot)	31af1cc	2019-09-17 21:06:53 -0700	[diff] [blame]	620	def quantiles(data, *, n=4, method='exclusive'):
Miss Islington (bot)	dafbe32	2019-09-05 00:42:22 -0700	[diff] [blame]	621	"""Divide data into n continuous intervals with equal probability.
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	622
				623	Returns a list of (n - 1) cut points separating the intervals.
				624
				625	Set n to 4 for quartiles (the default). Set n to 10 for deciles.
				626	Set n to 100 for percentiles which gives the 99 cuts points that
Miss Islington (bot)	dafbe32	2019-09-05 00:42:22 -0700	[diff] [blame]	627	separate data in to 100 equal sized groups.
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	628
Raymond Hettinger	cc1bdf9	2019-09-08 18:40:06 -0700	[diff] [blame]	629	The data can be any iterable containing sample.
				630	The cut points are linearly interpolated between data points.
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	631
Miss Islington (bot)	dafbe32	2019-09-05 00:42:22 -0700	[diff] [blame]	632	If method is set to inclusive, data is treated as population
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	633	data. The minimum value is treated as the 0th percentile and the
				634	maximum value is treated as the 100th percentile.
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	635	"""
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	636	if n < 1:
				637	raise StatisticsError('n must be at least 1')
Miss Islington (bot)	dafbe32	2019-09-05 00:42:22 -0700	[diff] [blame]	638	data = sorted(data)
Raymond Hettinger	9013ccf	2019-04-23 00:06:35 -0700	[diff] [blame]	639	ld = len(data)
				640	if ld < 2:
				641	raise StatisticsError('must have at least two data points')
				642	if method == 'inclusive':
				643	m = ld - 1
				644	result = []
				645	for i in range(1, n):
				646	j = i * m // n
				647	delta = im - jn
				648	interpolated = (data[j] * (n - delta) + data[j+1] * delta) / n
				649	result.append(interpolated)
				650	return result
				651	if method == 'exclusive':
				652	m = ld + 1
				653	result = []
				654	for i in range(1, n):
				655	j = i * m // n # rescale i to m/n
				656	j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1
				657	delta = im - jn # exact integer math
				658	interpolated = (data[j-1] * (n - delta) + data[j] * delta) / n
				659	result.append(interpolated)
				660	return result
				661	raise ValueError(f'Unknown method: {method!r}')
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	662
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	663
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	664	# === Measures of spread ===
				665
				666	# See http://mathworld.wolfram.com/Variance.html
				667	# http://mathworld.wolfram.com/SampleVariance.html
				668	# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
				669	#
				670	# Under no circumstances use the so-called "computational formula for
				671	# variance", as that is only suitable for hand calculations with a small
				672	# amount of low-precision data. It has terrible numeric properties.
				673	#
				674	# See a comparison of three computational methods here:
				675	# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
				676
				677	def _ss(data, c=None):
				678	"""Return sum of square deviations of sequence data.
				679
				680	If ``c`` is None, the mean is calculated in one pass, and the deviations
				681	from the mean are calculated in a second pass. Otherwise, deviations are
				682	calculated from ``c`` as given. Use the second case with care, as it can
				683	lead to garbage results.
				684	"""
Miss Islington (bot)	811e040	2020-06-13 16:57:17 -0700	[diff] [blame]	685	if c is not None:
				686	T, total, count = _sum((x-c)**2 for x in data)
				687	return (T, total)
				688	c = mean(data)
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	689	T, total, count = _sum((x-c)**2 for x in data)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	690	# The following sum should mathematically equal zero, but due to rounding
				691	# error may not.
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	692	U, total2, count2 = _sum((x-c) for x in data)
				693	assert T == U and count == count2
				694	total -= total2**2/len(data)
				695	assert not total < 0, 'negative sum of square deviations: %f' % total
				696	return (T, total)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	697
				698
				699	def variance(data, xbar=None):
				700	"""Return the sample variance of data.
				701
				702	data should be an iterable of Real-valued numbers, with at least two
				703	values. The optional argument xbar, if given, should be the mean of
				704	the data. If it is missing or None, the mean is automatically calculated.
				705
				706	Use this function when your data is a sample from a population. To
				707	calculate the variance from the entire population, see ``pvariance``.
				708
				709	Examples:
				710
				711	>>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
				712	>>> variance(data)
				713	1.3720238095238095
				714
				715	If you have already calculated the mean of your data, you can pass it as
				716	the optional second argument ``xbar`` to avoid recalculating it:
				717
				718	>>> m = mean(data)
				719	>>> variance(data, m)
				720	1.3720238095238095
				721
				722	This function does not check that ``xbar`` is actually the mean of
				723	``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
				724	impossible results.
				725
				726	Decimals and Fractions are supported:
				727
				728	>>> from decimal import Decimal as D
				729	>>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				730	Decimal('31.01875')
				731
				732	>>> from fractions import Fraction as F
				733	>>> variance([F(1, 6), F(1, 2), F(5, 3)])
				734	Fraction(67, 108)
				735
				736	"""
				737	if iter(data) is data:
				738	data = list(data)
				739	n = len(data)
				740	if n < 2:
				741	raise StatisticsError('variance requires at least two data points')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	742	T, ss = _ss(data, xbar)
				743	return _convert(ss/(n-1), T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	744
				745
				746	def pvariance(data, mu=None):
				747	"""Return the population variance of ``data``.
				748
Miss Islington (bot)	3562439	2019-11-12 00:04:12 -0800	[diff] [blame]	749	data should be a sequence or iterable of Real-valued numbers, with at least one
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	750	value. The optional argument mu, if given, should be the mean of
				751	the data. If it is missing or None, the mean is automatically calculated.
				752
				753	Use this function to calculate the variance from the entire population.
				754	To estimate the variance from a sample, the ``variance`` function is
				755	usually a better choice.
				756
				757	Examples:
				758
				759	>>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
				760	>>> pvariance(data)
				761	1.25
				762
				763	If you have already calculated the mean of the data, you can pass it as
				764	the optional second argument to avoid recalculating it:
				765
				766	>>> mu = mean(data)
				767	>>> pvariance(data, mu)
				768	1.25
				769
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	770	Decimals and Fractions are supported:
				771
				772	>>> from decimal import Decimal as D
				773	>>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				774	Decimal('24.815')
				775
				776	>>> from fractions import Fraction as F
				777	>>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
				778	Fraction(13, 72)
				779
				780	"""
				781	if iter(data) is data:
				782	data = list(data)
				783	n = len(data)
				784	if n < 1:
				785	raise StatisticsError('pvariance requires at least one data point')
Steven D'Aprano	b28c327	2015-12-01 19:59:53 +1100	[diff] [blame]	786	T, ss = _ss(data, mu)
				787	return _convert(ss/n, T)
Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame]	788
				789
				790	def stdev(data, xbar=None):
				791	"""Return the square root of the sample variance.
				792
				793	See ``variance`` for arguments and other details.
				794
				795	>>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				796	1.0810874155219827
				797
				798	"""
				799	var = variance(data, xbar)
				800	try:
				801	return var.sqrt()
				802	except AttributeError:
				803	return math.sqrt(var)
				804
				805
				806	def pstdev(data, mu=None):
				807	"""Return the square root of the population variance.
				808
				809	See ``pvariance`` for arguments and other details.
				810
				811	>>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				812	0.986893273527251
				813
				814	"""
				815	var = pvariance(data, mu)
				816	try:
				817	return var.sqrt()
				818	except AttributeError:
				819	return math.sqrt(var)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	820
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	821
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	822	## Normal Distribution #####################################################
				823
Miss Islington (bot)	5779c53	2019-08-23 15:39:27 -0700	[diff] [blame]	824
				825	def _normal_dist_inv_cdf(p, mu, sigma):
				826	# There is no closed-form solution to the inverse CDF for the normal
				827	# distribution, so we use a rational approximation instead:
				828	# Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
				829	# Normal Distribution". Applied Statistics. Blackwell Publishing. 37
				830	# (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
				831	q = p - 0.5
				832	if fabs(q) <= 0.425:
				833	r = 0.180625 - q * q
				834	# Hash sum: 55.88319_28806_14901_4439
				835	num = (((((((2.50908_09287_30122_6727e+3 * r +
				836	3.34305_75583_58812_8105e+4) * r +
				837	6.72657_70927_00870_0853e+4) * r +
				838	4.59219_53931_54987_1457e+4) * r +
				839	1.37316_93765_50946_1125e+4) * r +
				840	1.97159_09503_06551_4427e+3) * r +
				841	1.33141_66789_17843_7745e+2) * r +
				842	3.38713_28727_96366_6080e+0) * q
				843	den = (((((((5.22649_52788_52854_5610e+3 * r +
				844	2.87290_85735_72194_2674e+4) * r +
				845	3.93078_95800_09271_0610e+4) * r +
				846	2.12137_94301_58659_5867e+4) * r +
				847	5.39419_60214_24751_1077e+3) * r +
				848	6.87187_00749_20579_0830e+2) * r +
				849	4.23133_30701_60091_1252e+1) * r +
				850	1.0)
				851	x = num / den
				852	return mu + (x * sigma)
				853	r = p if q <= 0.0 else 1.0 - p
				854	r = sqrt(-log(r))
				855	if r <= 5.0:
				856	r = r - 1.6
				857	# Hash sum: 49.33206_50330_16102_89036
				858	num = (((((((7.74545_01427_83414_07640e-4 * r +
				859	2.27238_44989_26918_45833e-2) * r +
				860	2.41780_72517_74506_11770e-1) * r +
				861	1.27045_82524_52368_38258e+0) * r +
				862	3.64784_83247_63204_60504e+0) * r +
				863	5.76949_72214_60691_40550e+0) * r +
				864	4.63033_78461_56545_29590e+0) * r +
				865	1.42343_71107_49683_57734e+0)
				866	den = (((((((1.05075_00716_44416_84324e-9 * r +
				867	5.47593_80849_95344_94600e-4) * r +
				868	1.51986_66563_61645_71966e-2) * r +
				869	1.48103_97642_74800_74590e-1) * r +
				870	6.89767_33498_51000_04550e-1) * r +
				871	1.67638_48301_83803_84940e+0) * r +
				872	2.05319_16266_37758_82187e+0) * r +
				873	1.0)
				874	else:
				875	r = r - 5.0
				876	# Hash sum: 47.52583_31754_92896_71629
				877	num = (((((((2.01033_43992_92288_13265e-7 * r +
				878	2.71155_55687_43487_57815e-5) * r +
				879	1.24266_09473_88078_43860e-3) * r +
				880	2.65321_89526_57612_30930e-2) * r +
				881	2.96560_57182_85048_91230e-1) * r +
				882	1.78482_65399_17291_33580e+0) * r +
				883	5.46378_49111_64114_36990e+0) * r +
				884	6.65790_46435_01103_77720e+0)
				885	den = (((((((2.04426_31033_89939_78564e-15 * r +
				886	1.42151_17583_16445_88870e-7) * r +
				887	1.84631_83175_10054_68180e-5) * r +
				888	7.86869_13114_56132_59100e-4) * r +
				889	1.48753_61290_85061_48525e-2) * r +
				890	1.36929_88092_27358_05310e-1) * r +
				891	5.99832_20655_58879_37690e-1) * r +
				892	1.0)
				893	x = num / den
				894	if q < 0.0:
				895	x = -x
				896	return mu + (x * sigma)
				897
				898
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	899	class NormalDist:
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	900	"Normal distribution of a random variable"
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	901	# https://en.wikipedia.org/wiki/Normal_distribution
				902	# https://en.wikipedia.org/wiki/Variance#Properties
				903
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	904	__slots__ = {
				905	'_mu': 'Arithmetic mean of a normal distribution',
				906	'_sigma': 'Standard deviation of a normal distribution',
				907	}
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	908
				909	def __init__(self, mu=0.0, sigma=1.0):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	910	"NormalDist where mu is the mean and sigma is the standard deviation."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	911	if sigma < 0.0:
				912	raise StatisticsError('sigma must be non-negative')
Miss Islington (bot)	dafbe32	2019-09-05 00:42:22 -0700	[diff] [blame]	913	self._mu = float(mu)
				914	self._sigma = float(sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	915
				916	@classmethod
				917	def from_samples(cls, data):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	918	"Make a normal distribution instance from sample data."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	919	if not isinstance(data, (list, tuple)):
				920	data = list(data)
				921	xbar = fmean(data)
				922	return cls(xbar, stdev(data, xbar))
				923
Raymond Hettinger	fb8c7d5	2019-04-23 01:46:18 -0700	[diff] [blame]	924	def samples(self, n, *, seed=None):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	925	"Generate n samples for a given mean and standard deviation."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	926	gauss = random.gauss if seed is None else random.Random(seed).gauss
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	927	mu, sigma = self._mu, self._sigma
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	928	return [gauss(mu, sigma) for i in range(n)]
				929
				930	def pdf(self, x):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	931	"Probability density function. P(x <= X < x+dx) / dx"
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	932	variance = self._sigma ** 2.0
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	933	if not variance:
				934	raise StatisticsError('pdf() not defined when sigma is zero')
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	935	return exp((x - self._mu)*2.0 / (-2.0variance)) / sqrt(tau*variance)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	936
				937	def cdf(self, x):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	938	"Cumulative distribution function. P(X <= x)"
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	939	if not self._sigma:
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	940	raise StatisticsError('cdf() not defined when sigma is zero')
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	941	return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	942
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	943	def inv_cdf(self, p):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	944	"""Inverse cumulative distribution function. x : P(X <= x) = p
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	945
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	946	Finds the value of the random variable such that the probability of
				947	the variable being less than or equal to that value equals the given
				948	probability.
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	949
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	950	This function is also called the percent point function or quantile
				951	function.
				952	"""
				953	if p <= 0.0 or p >= 1.0:
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	954	raise StatisticsError('p must be in the range 0.0 < p < 1.0')
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	955	if self._sigma <= 0.0:
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	956	raise StatisticsError('cdf() not defined when sigma at or below zero')
Miss Islington (bot)	5779c53	2019-08-23 15:39:27 -0700	[diff] [blame]	957	return _normal_dist_inv_cdf(p, self._mu, self._sigma)
Raymond Hettinger	714c60d	2019-03-18 20:17:14 -0700	[diff] [blame]	958
Raymond Hettinger	cc1bdf9	2019-09-08 18:40:06 -0700	[diff] [blame]	959	def quantiles(self, n=4):
				960	"""Divide into n continuous intervals with equal probability.
				961
				962	Returns a list of (n - 1) cut points separating the intervals.
				963
				964	Set n to 4 for quartiles (the default). Set n to 10 for deciles.
				965	Set n to 100 for percentiles which gives the 99 cuts points that
				966	separate the normal distribution in to 100 equal sized groups.
				967	"""
				968	return [self.inv_cdf(i / n) for i in range(1, n)]
				969
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	970	def overlap(self, other):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	971	"""Compute the overlapping coefficient (OVL) between two normal distributions.
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	972
				973	Measures the agreement between two normal probability distributions.
				974	Returns a value between 0.0 and 1.0 giving the overlapping area in
				975	the two underlying probability density functions.
				976
				977	>>> N1 = NormalDist(2.4, 1.6)
				978	>>> N2 = NormalDist(3.2, 2.0)
				979	>>> N1.overlap(N2)
				980	0.8035050657330205
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	981	"""
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	982	# See: "The overlapping coefficient as a measure of agreement between
				983	# probability distributions and point estimation of the overlap of two
				984	# normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
				985	# http://dx.doi.org/10.1080/03610928908830127
				986	if not isinstance(other, NormalDist):
				987	raise TypeError('Expected another NormalDist instance')
				988	X, Y = self, other
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	989	if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	990	X, Y = Y, X
				991	X_var, Y_var = X.variance, Y.variance
				992	if not X_var or not Y_var:
				993	raise StatisticsError('overlap() not defined when sigma is zero')
				994	dv = Y_var - X_var
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	995	dm = fabs(Y._mu - X._mu)
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	996	if not dv:
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	997	return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
				998	a = X._mu * Y_var - Y._mu * X_var
				999	b = X._sigma * Y._sigma * sqrt(dm*2.0 + dv log(Y_var / X_var))
Raymond Hettinger	318d537	2019-03-06 22:59:40 -0800	[diff] [blame]	1000	x1 = (a + b) / dv
				1001	x2 = (a - b) / dv
				1002	return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
				1003
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1004	@property
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1005	def mean(self):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1006	"Arithmetic mean of the normal distribution."
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1007	return self._mu
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1008
				1009	@property
Raymond Hettinger	cc1bdf9	2019-09-08 18:40:06 -0700	[diff] [blame]	1010	def median(self):
				1011	"Return the median of the normal distribution"
				1012	return self._mu
				1013
				1014	@property
				1015	def mode(self):
				1016	"""Return the mode of the normal distribution
				1017
				1018	The mode is the value x where which the probability density
				1019	function (pdf) takes its maximum value.
				1020	"""
				1021	return self._mu
				1022
				1023	@property
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1024	def stdev(self):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1025	"Standard deviation of the normal distribution."
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1026	return self._sigma
Raymond Hettinger	9e456bc	2019-02-24 11:44:55 -0800	[diff] [blame]	1027
				1028	@property
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1029	def variance(self):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1030	"Square of the standard deviation."
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1031	return self._sigma ** 2.0
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1032
				1033	def __add__(x1, x2):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1034	"""Add a constant or another NormalDist instance.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1035
				1036	If other is a constant, translate mu by the constant,
				1037	leaving sigma unchanged.
				1038
				1039	If other is a NormalDist, add both the means and the variances.
				1040	Mathematically, this works only if the two distributions are
				1041	independent or if they are jointly normally distributed.
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1042	"""
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1043	if isinstance(x2, NormalDist):
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1044	return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
				1045	return NormalDist(x1._mu + x2, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1046
				1047	def __sub__(x1, x2):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1048	"""Subtract a constant or another NormalDist instance.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1049
				1050	If other is a constant, translate by the constant mu,
				1051	leaving sigma unchanged.
				1052
				1053	If other is a NormalDist, subtract the means and add the variances.
				1054	Mathematically, this works only if the two distributions are
				1055	independent or if they are jointly normally distributed.
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1056	"""
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1057	if isinstance(x2, NormalDist):
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1058	return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
				1059	return NormalDist(x1._mu - x2, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1060
				1061	def __mul__(x1, x2):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1062	"""Multiply both mu and sigma by a constant.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1063
				1064	Used for rescaling, perhaps to change measurement units.
				1065	Sigma is scaled with the absolute value of the constant.
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1066	"""
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1067	return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1068
				1069	def __truediv__(x1, x2):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1070	"""Divide both mu and sigma by a constant.
Raymond Hettinger	5f1e8b4	2019-03-18 22:24:15 -0700	[diff] [blame]	1071
				1072	Used for rescaling, perhaps to change measurement units.
				1073	Sigma is scaled with the absolute value of the constant.
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1074	"""
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1075	return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1076
				1077	def __pos__(x1):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1078	"Return a copy of the instance."
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1079	return NormalDist(x1._mu, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1080
				1081	def __neg__(x1):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1082	"Negates mu while keeping sigma the same."
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1083	return NormalDist(-x1._mu, x1._sigma)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1084
				1085	__radd__ = __add__
				1086
				1087	def __rsub__(x1, x2):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1088	"Subtract a NormalDist from a constant or another NormalDist."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1089	return -(x1 - x2)
				1090
				1091	__rmul__ = __mul__
				1092
				1093	def __eq__(x1, x2):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1094	"Two NormalDist objects are equal if their mu and sigma are both equal."
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1095	if not isinstance(x2, NormalDist):
				1096	return NotImplemented
Miss Islington (bot)	652a1cb	2019-10-18 14:41:19 -0700	[diff] [blame]	1097	return x1._mu == x2._mu and x1._sigma == x2._sigma
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1098
				1099	def __hash__(self):
Miss Islington (bot)	e8b3a2e	2019-07-21 12:38:58 -0700	[diff] [blame]	1100	"NormalDist objects hash equal if their mu and sigma are both equal."
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1101	return hash((self._mu, self._sigma))
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1102
				1103	def __repr__(self):
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1104	return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1105
Miss Islington (bot)	5779c53	2019-08-23 15:39:27 -0700	[diff] [blame]	1106	# If available, use C implementation
				1107	try:
				1108	from _statistics import _normal_dist_inv_cdf
				1109	except ImportError:
				1110	pass
				1111
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1112
				1113	if __name__ == '__main__':
				1114
				1115	# Show math operations computed analytically in comparsion
				1116	# to a monte carlo simulation of the same operations
				1117
				1118	from math import isclose
				1119	from operator import add, sub, mul, truediv
				1120	from itertools import repeat
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	1121	import doctest
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1122
				1123	g1 = NormalDist(10, 20)
				1124	g2 = NormalDist(-5, 25)
				1125
				1126	# Test scaling by a constant
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1127	assert (g1 * 5 / 5).mean == g1.mean
				1128	assert (g1 * 5 / 5).stdev == g1.stdev
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1129
				1130	n = 100_000
				1131	G1 = g1.samples(n)
				1132	G2 = g2.samples(n)
				1133
				1134	for func in (add, sub):
				1135	print(f'\nTest {func.__name__} with another NormalDist:')
				1136	print(func(g1, g2))
				1137	print(NormalDist.from_samples(map(func, G1, G2)))
				1138
				1139	const = 11
				1140	for func in (add, sub, mul, truediv):
				1141	print(f'\nTest {func.__name__} with a constant:')
				1142	print(func(g1, const))
				1143	print(NormalDist.from_samples(map(func, G1, repeat(const))))
				1144
				1145	const = 19
				1146	for func in (add, sub, mul):
				1147	print(f'\nTest constant with {func.__name__}:')
				1148	print(func(const, g1))
				1149	print(NormalDist.from_samples(map(func, repeat(const), G1)))
				1150
				1151	def assert_close(G1, G2):
Miss Islington (bot)	c613c33	2019-07-21 00:55:13 -0700	[diff] [blame]	1152	assert isclose(G1.mean, G1.mean, rel_tol=0.01), (G1, G2)
				1153	assert isclose(G1.stdev, G2.stdev, rel_tol=0.01), (G1, G2)
Raymond Hettinger	11c7953	2019-02-23 14:44:07 -0800	[diff] [blame]	1154
				1155	X = NormalDist(-105, 73)
				1156	Y = NormalDist(31, 47)
				1157	s = 32.75
				1158	n = 100_000
				1159
				1160	S = NormalDist.from_samples([x + s for x in X.samples(n)])
				1161	assert_close(X + s, S)
				1162
				1163	S = NormalDist.from_samples([x - s for x in X.samples(n)])
				1164	assert_close(X - s, S)
				1165
				1166	S = NormalDist.from_samples([x * s for x in X.samples(n)])
				1167	assert_close(X * s, S)
				1168
				1169	S = NormalDist.from_samples([x / s for x in X.samples(n)])
				1170	assert_close(X / s, S)
				1171
				1172	S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
				1173	Y.samples(n))])
				1174	assert_close(X + Y, S)
				1175
				1176	S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
				1177	Y.samples(n))])
				1178	assert_close(X - Y, S)
Raymond Hettinger	fc06a19	2019-03-12 00:43:27 -0700	[diff] [blame]	1179
				1180	print(doctest.testmod())