"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

================== =============================================
Function           Description
================== =============================================
mean               Arithmetic mean (average) of data.
harmonic_mean      Harmonic mean of data.
median             Median (middle value) of data.
median_low         Low median of data.
median_high        High median of data.
median_grouped     Median, or 50th percentile, of grouped data.
mode               Mode (most common value) of data.
================== =============================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5. The median of these data points is 2.8333...


Calculating variability or spread
---------------------------------

================== =============================================
Function           Description
================== =============================================
pvariance          Population variance of data.
variance           Sample variance of data.
pstdev             Population standard deviation of data.
stdev              Sample standard deviation of data.
================== =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = ['StatisticsError', 'NormalDist',
           'pstdev', 'pvariance', 'stdev', 'variance',
           'median', 'median_low', 'median_high', 'median_grouped',
           'mean', 'mode', 'harmonic_mean', 'fmean',
           ]

import collections
import math
import numbers
import random

from fractions import Fraction
from decimal import Decimal
from itertools import groupby
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau


# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get
    T = _coerce(int, type(start))
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)

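# How the accumulator above behaves on a small mixed input (a minimal sketch,
# not a doctest): each value's exact ratio is grouped by denominator, so no
# precision is lost until the final Fraction sum.
#
#     >>> _sum([0.5, 0.25, 1])
#     (<class 'float'>, Fraction(7, 4), 3)
#
# Internally the partials dict maps denominator to accumulated exact
# numerator, here {1: 1, 2: 1, 4: 1}, which sums to 1 + 1/2 + 1/4 = 7/4.
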
def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
    """
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S:  return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool:  return T
    if T is int:  return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T):  return S
    if issubclass(T, S):  return T
    # Ints coerce to the other type.
    if issubclass(T, int):  return S
    if issubclass(S, int):  return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))

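# A few concrete coercions (a minimal sketch, not a doctest; they follow the
# rules spelled out above):
#
#     >>> _coerce(int, float)           # int defers to the other type
#     <class 'float'>
#     >>> _coerce(Fraction, float)      # Fraction mixed with float -> float
#     <class 'float'>
#     >>> _coerce(float, Decimal)       # unrelated types are rejected
#     Traceback (most recent call last):
#       ...
#     TypeError: don't know how to coerce float and Decimal
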
def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))

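# Behaviour on the supported types (a minimal sketch, not a doctest):
#
#     >>> _exact_ratio(Decimal("0.1"))    # exact, unlike the binary float 0.1
#     (1, 10)
#     >>> _exact_ratio(float('inf'))      # non-finite values get a None denominator
#     (inf, None)
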
def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise

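# Typical conversions at the end of a calculation (a minimal sketch, not a
# doctest): the exact Fraction total is folded back into the inferred type.
#
#     >>> _convert(Fraction(11, 4), float)
#     2.75
#     >>> _convert(Fraction(6, 2), int)    # exact integer stays an int
#     3
#     >>> _convert(Fraction(7, 2), int)    # non-integral result falls back to float
#     3.5
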
def _counts(data):
    # Generate a table of sorted (value, frequency) pairs.
    table = collections.Counter(iter(data)).most_common()
    if not table:
        return table
    # Extract the values with the highest frequency.
    maxfreq = table[0][1]
    for i in range(1, len(table)):
        if table[i][1] != maxfreq:
            table = table[:i]
            break
    return table


def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError


def _find_rteq(a, l, x):
    'Locate the rightmost value exactly equal to x'
    i = bisect_right(a, x, lo=l)
    if i != (len(a) + 1) and a[i - 1] == x:
        return i - 1
    raise ValueError


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total / n, T)


def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    The result is highly accurate but not as perfect as mean().
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25

    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(x):
            nonlocal n
            n += 1
            return x
        total = math.fsum(map(count, data))
    else:
        total = math.fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None

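# fmean() also accepts one-shot iterators; the length is counted on the fly
# (a minimal sketch, not a doctest):
#
#     >>> fmean(iter([3.5, 4.0, 5.25]))
#     4.25
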
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    try:
        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        return 0
    assert count == n
    return _convert(n/total, T)

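# The arithmetic behind the docstring example above (a worked sketch):
# harmonic_mean([2.5, 3, 10]) is n divided by the sum of the reciprocals,
#
#     3 / (1/2.5 + 1/3 + 1/10)  =  3 / (5/6)  =  18/5  =  3.6
#
# whereas the arithmetic mean (2.5 + 3 + 10) / 3 is about 5.17.
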
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n//2]
    else:
        i = n//2
        return (data[i - 1] + data[i])/2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n//2]
    else:
        return data[n//2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n//2]

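# How the three simple medians differ on the same even-sized data set
# (a minimal sketch, not a doctest):
#
#     >>> median([1, 3, 5, 7])        # interpolates between the two middle values
#     4.0
#     >>> median_low([1, 3, 5, 7])    # always a member of the data set
#     3
#     >>> median_high([1, 3, 5, 7])   # always a member of the data set
#     5
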
def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2

    # Use bisection search to locate x in data with O(log n) complexity.
    # Find the position of the leftmost occurrence of x in data.
    l1 = _find_lteq(data, x)
    # Find the position of the rightmost occurrence of x in data[l1:len(data)],
    # assuming l1 <= l2.
    l2 = _find_rteq(data, l1, x)
    cf = l1
    f = l2 - l1 + 1
    return L + interval*(n/2 - cf)/f

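# The interpolation above is the textbook grouped-median formula
#
#     median = L + interval * (n/2 - cf) / f
#
# where L is the lower limit of the class interval containing the middle value,
# cf is the cumulative frequency below that interval, and f is its frequency.
# For the module docstring example median_grouped([2, 2, 3, 3, 3, 4]):
# L = 2.5, n = 6, cf = 2, f = 3, giving 2.5 + (3 - 2)/3 = 2.8333...
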
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there is not exactly one most common value, ``mode`` will raise
    StatisticsError.
    """
    # Generate a table of sorted (value, frequency) pairs.
    table = _counts(data)
    if len(table) == 1:
        return table[0][0]
    elif table:
        raise StatisticsError(
            'no unique mode; found %d equally common values' % len(table)
        )
    else:
        raise StatisticsError('no mode for empty data')


# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html
#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
#
# Under no circumstances use the so-called "computational formula for
# variance", as that is only suitable for hand calculations with a small
# amount of low-precision data. It has terrible numeric properties.
#
# See a comparison of three computational methods here:
# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/

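# Why the "computational formula" is avoided, in one worked example that is
# easy to verify by hand (a sketch only): for
#
#     data = [1e9 + 4, 1e9 + 7, 1e9 + 13, 1e9 + 16]
#
# the deviations from the mean 1e9 + 10 are -6, -3, 3, 6, so the exact sample
# variance is (36 + 9 + 9 + 36) / 3 = 30, and variance(data) returns 30.0.
# Evaluating the shortcut sum(x*x for x in data)/3 - sum(data)**2/12 in floats
# instead cancels nearly all of the significant digits and gives a badly wrong
# answer.
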
def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.
    """
    if c is None:
        c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    U, total2, count2 = _sum((x-c) for x in data)
    assert T == U and count == count2
    total -= total2**2/len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)

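# The correction applied in _ss() above relies on the identity
#
#     sum((x - m)**2) == sum((x - c)**2) - (sum(x - c))**2 / n
#
# where m is the true mean and c is any estimate of it: the subtracted term is
# exactly zero when c == m, and otherwise compensates for rounding error in c
# (a brief sketch of the reasoning; see the references above).
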
def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    return _convert(ss/(n-1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be an iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    This function does not check that ``mu`` is actually the mean of ``data``.
    Giving arbitrary values for ``mu`` may lead to invalid or impossible
    results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    return _convert(ss/n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)

## Normal Distribution #####################################################

class NormalDist:
    'Normal distribution of a random variable'
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = ('mu', 'sigma')

    def __init__(self, mu=0.0, sigma=1.0):
        'NormalDist where mu is the mean and sigma is the standard deviation'
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self.mu = mu
        self.sigma = sigma

    @classmethod
    def from_samples(cls, data):
        'Make a normal distribution instance from sample data'
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, seed=None):
        'Generate *n* samples for a given mean and standard deviation'
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self.mu, self.sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        'Probability density function: P(x <= X < x+dx) / dx'
        variance = self.sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self.mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance)

    def cdf(self, x):
        'Cumulative distribution function: P(X <= x)'
        if not self.sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))

    @property
    def mean(self):
        'Arithmetic mean of the normal distribution'
        return self.mu

    @property
    def stdev(self):
        'Standard deviation of the normal distribution'
        return self.sigma

    @property
    def variance(self):
        'Square of the standard deviation'
        return self.sigma ** 2.0

    def __add__(x1, x2):
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu + x2, x1.sigma)

    def __sub__(x1, x2):
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu - x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu - x2, x1.sigma)

    def __mul__(x1, x2):
        return NormalDist(x1.mu * x2, x1.sigma * fabs(x2))

    def __truediv__(x1, x2):
        return NormalDist(x1.mu / x2, x1.sigma / fabs(x2))

    def __pos__(x1):
        return NormalDist(x1.mu, x1.sigma)

    def __neg__(x1):
        return NormalDist(-x1.mu, x1.sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return (x1.mu, x1.sigma) == (x2.mu, x2.sigma)

    def __repr__(self):
        return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})'

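# A minimal usage sketch for NormalDist (illustrative, not a doctest):
#
#     >>> NormalDist(2, 3) + NormalDist(1, 4)    # means add, sigmas add in quadrature
#     NormalDist(mu=3, sigma=5.0)
#     >>> NormalDist(mu=100, sigma=15).cdf(100)  # half the mass lies below the mean
#     0.5
#     >>> NormalDist.from_samples([1, 2, 3, 4, 5]).mean
#     3.0
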
if __name__ == '__main__':

    # Show math operations computed analytically in comparison
    # to a Monte Carlo simulation of the same operations

    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat

    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)

    # Test scaling by a constant
    assert (g1 * 5 / 5).mu == g1.mu
    assert (g1 * 5 / 5).sigma == g1.sigma

    n = 100_000
    G1 = g1.samples(n)
    G2 = g2.samples(n)

    for func in (add, sub):
        print(f'\nTest {func.__name__} with another NormalDist:')
        print(func(g1, g2))
        print(NormalDist.from_samples(map(func, G1, G2)))

    const = 11
    for func in (add, sub, mul, truediv):
        print(f'\nTest {func.__name__} with a constant:')
        print(func(g1, const))
        print(NormalDist.from_samples(map(func, G1, repeat(const))))

    const = 19
    for func in (add, sub, mul):
        print(f'\nTest constant with {func.__name__}:')
        print(func(const, g1))
        print(NormalDist.from_samples(map(func, repeat(const), G1)))

    def assert_close(G1, G2):
        assert isclose(G1.mu, G2.mu, rel_tol=0.01), (G1, G2)
        assert isclose(G1.sigma, G2.sigma, rel_tol=0.01), (G1, G2)

    X = NormalDist(-105, 73)
    Y = NormalDist(31, 47)
    s = 32.75
    n = 100_000

    S = NormalDist.from_samples([x + s for x in X.samples(n)])
    assert_close(X + s, S)

    S = NormalDist.from_samples([x - s for x in X.samples(n)])
    assert_close(X - s, S)

    S = NormalDist.from_samples([x * s for x in X.samples(n)])
    assert_close(X * s, S)

    S = NormalDist.from_samples([x / s for x in X.samples(n)])
    assert_close(X / s, S)

    S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X + Y, S)

    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)