"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

==================  =============================================
Function            Description
==================  =============================================
mean                Arithmetic mean (average) of data.
harmonic_mean       Harmonic mean of data.
median              Median (middle value) of data.
median_low          Low median of data.
median_high         High median of data.
median_grouped      Median, or 50th percentile, of grouped data.
mode                Mode (most common value) of data.
multimode           List of modes (most common values of data).
==================  =============================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5.  The median of these data points is 2.8333...


Calculating variability or spread
---------------------------------

==================  =============================================
Function            Description
==================  =============================================
pvariance           Population variance of data.
variance            Sample variance of data.
pstdev              Population standard deviation of data.
stdev               Sample standard deviation of data.
==================  =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = ['StatisticsError', 'NormalDist',
           'pstdev', 'pvariance', 'stdev', 'variance',
           'median', 'median_low', 'median_high', 'median_grouped',
           'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
           ]

import math
import numbers
import random

from fractions import Fraction
from decimal import Decimal
from itertools import groupby
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from operator import itemgetter
from collections import Counter

# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get
    T = _coerce(int, type(start))
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
    """
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S: return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool: return T
    if T is int: return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T): return S
    if issubclass(T, S): return T
    # Ints coerce to the other type.
    if issubclass(T, int): return S
    if issubclass(S, int): return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))
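
# A rough illustration of the rules above (a sketch, not an exhaustive
# specification -- the authoritative cases are in test_statistics.CoerceTest):
#
#     _coerce(int, float)        -> float     (int defers to the other type)
#     _coerce(Decimal, int)      -> Decimal
#     _coerce(float, Fraction)   -> float     (Fraction & float give float)
#     _coerce(Fraction, Decimal) -> TypeError (no obvious common type)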


def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator)/T(value.denominator)
        else:
            raise


def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError


def _find_rteq(a, l, x):
    'Locate the rightmost value exactly equal to x'
    i = bisect_right(a, x, lo=l)
    if i != (len(a)+1) and a[i-1] == x:
        return i-1
    raise ValueError
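
# Example of the two helpers above (illustrative): with a = [1, 2, 2, 2, 3],
# _find_lteq(a, 2) returns 1 and _find_rteq(a, 1, 2) returns 3, giving the
# leftmost and rightmost positions occupied by the value 2.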


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x

# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total/n, T)


def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    The result is highly accurate but not as perfect as mean().
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25

    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(x):
            nonlocal n
            n += 1
            return x
        total = fsum(map(count, data))
    else:
        total = fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None

def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    try:
        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        return 0
    assert count == n
    return _convert(n/total, T)

# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        i = n//2
        return (data[i - 1] + data[i])/2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        return data[n//2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n//2]

def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    cf = l1
    f = l2 - l1 + 1
    return L + interval*(n/2 - cf)/f

def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes, return the first one encountered.

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    data = iter(data)
    try:
        return Counter(data).most_common(1)[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []

    """
    counts = Counter(iter(data)).most_common()
    maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
    return list(map(itemgetter(0), mode_items))

# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html
#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
#
# Under no circumstances use the so-called "computational formula for
# variance", as that is only suitable for hand calculations with a small
# amount of low-precision data. It has terrible numeric properties.
#
# See a comparison of three computational methods here:
# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
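#
# A small sketch of the failure mode (illustrative only -- the exact wrong
# answer produced by the naive formula depends on the platform's rounding):
#
#     data = [1e9 + x for x in (4.0, 7.0, 13.0, 16.0)]   # true sample variance is 30.0
#     (sum(x*x for x in data) - sum(data)**2 / 4) / 3    # "computational formula"
#         -> catastrophic cancellation; the result can be wrong in every digit
#     variance(data)
#         -> 30.0, because _ss() sums exact Fractions and applies a correction term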

def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.
    """
    if c is None:
        c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    U, total2, count2 = _sum((x-c) for x in data)
    assert T == U and count == count2
    total -= total2**2/len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    return _convert(ss/(n-1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be an iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    This function does not check that ``mu`` is actually the mean of ``data``.
    Giving arbitrary values for ``mu`` may lead to invalid or impossible
    results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    return _convert(ss/n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)

## Normal Distribution #####################################################

class NormalDist:
    'Normal distribution of a random variable'
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {'mu': 'Arithmetic mean of a normal distribution',
                 'sigma': 'Standard deviation of a normal distribution'}

    def __init__(self, mu=0.0, sigma=1.0):
        'NormalDist where mu is the mean and sigma is the standard deviation.'
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self.mu = mu
        self.sigma = sigma

    @classmethod
    def from_samples(cls, data):
        'Make a normal distribution instance from sample data.'
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, seed=None):
        'Generate *n* samples for a given mean and standard deviation.'
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self.mu, self.sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        'Probability density function.  P(x <= X < x+dx) / dx'
        variance = self.sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self.mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance)

    def cdf(self, x):
        'Cumulative distribution function.  P(X <= x)'
        if not self.sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))
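
    # Typical values for the standard normal distribution, shown here as an
    # illustrative comment rather than as doctests (figures are approximate):
    #     NormalDist().pdf(0.0)   ->  ~0.3989   # peak density, 1/sqrt(2*pi)
    #     NormalDist().cdf(0.0)   ->   0.5      # half the mass lies below the mean
    #     NormalDist().cdf(1.96)  ->  ~0.975    # 1.96 is the familiar two-sided 95% z-value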

    def inv_cdf(self, p):
        '''Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of the
        variable being less than or equal to that value equals the given probability.

        This function is also called the percent point function or quantile function.
        '''
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self.sigma <= 0.0:
            raise StatisticsError('inv_cdf() not defined when sigma is at or below zero')

        # There is no closed-form solution to the inverse CDF for the normal
        # distribution, so we use a rational approximation instead:
        # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
        # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
        # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.

        q = p - 0.5
        if fabs(q) <= 0.425:
            r = 0.180625 - q * q
            num = (((((((2.50908_09287_30122_6727e+3 * r +
                         3.34305_75583_58812_8105e+4) * r +
                         6.72657_70927_00870_0853e+4) * r +
                         4.59219_53931_54987_1457e+4) * r +
                         1.37316_93765_50946_1125e+4) * r +
                         1.97159_09503_06551_4427e+3) * r +
                         1.33141_66789_17843_7745e+2) * r +
                         3.38713_28727_96366_6080e+0) * q
            den = (((((((5.22649_52788_52854_5610e+3 * r +
                         2.87290_85735_72194_2674e+4) * r +
                         3.93078_95800_09271_0610e+4) * r +
                         2.12137_94301_58659_5867e+4) * r +
                         5.39419_60214_24751_1077e+3) * r +
                         6.87187_00749_20579_0830e+2) * r +
                         4.23133_30701_60091_1252e+1) * r +
                         1.0)
            x = num / den
            return self.mu + (x * self.sigma)
        r = p if q <= 0.0 else 1.0 - p
        r = sqrt(-log(r))
        if r <= 5.0:
            r = r - 1.6
            num = (((((((7.74545_01427_83414_07640e-4 * r +
                         2.27238_44989_26918_45833e-2) * r +
                         2.41780_72517_74506_11770e-1) * r +
                         1.27045_82524_52368_38258e+0) * r +
                         3.64784_83247_63204_60504e+0) * r +
                         5.76949_72214_60691_40550e+0) * r +
                         4.63033_78461_56545_29590e+0) * r +
                         1.42343_71107_49683_57734e+0)
            den = (((((((1.05075_00716_44416_84324e-9 * r +
                         5.47593_80849_95344_94600e-4) * r +
                         1.51986_66563_61645_71966e-2) * r +
                         1.48103_97642_74800_74590e-1) * r +
                         6.89767_33498_51000_04550e-1) * r +
                         1.67638_48301_83803_84940e+0) * r +
                         2.05319_16266_37758_82187e+0) * r +
                         1.0)
        else:
            r = r - 5.0
            num = (((((((2.01033_43992_92288_13265e-7 * r +
                         2.71155_55687_43487_57815e-5) * r +
                         1.24266_09473_88078_43860e-3) * r +
                         2.65321_89526_57612_30930e-2) * r +
                         2.96560_57182_85048_91230e-1) * r +
                         1.78482_65399_17291_33580e+0) * r +
                         5.46378_49111_64114_36990e+0) * r +
                         6.65790_46435_01103_77720e+0)
            den = (((((((2.04426_31033_89939_78564e-15 * r +
                         1.42151_17583_16445_88870e-7) * r +
                         1.84631_83175_10054_68180e-5) * r +
                         7.86869_13114_56132_59100e-4) * r +
                         1.48753_61290_85061_48525e-2) * r +
                         1.36929_88092_27358_05310e-1) * r +
                         5.99832_20655_58879_37690e-1) * r +
                         1.0)
        x = num / den
        if q < 0.0:
            x = -x
        return self.mu + (x * self.sigma)
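
    # Illustrative round trip (values are approximate): for NormalDist(100, 15),
    # inv_cdf(0.90) is roughly 119.2 and cdf(119.2) recovers roughly 0.90; for the
    # standard normal distribution, inv_cdf(0.975) is the well-known ~1.96.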

    def overlap(self, other):
        '''Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        '''
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y.sigma, Y.mu) < (X.sigma, X.mu):   # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y.mu - X.mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X.sigma * sqrt(2.0)))
        a = X.mu * Y_var - Y.mu * X_var
        b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    @property
    def mean(self):
        'Arithmetic mean of the normal distribution.'
        return self.mu

    @property
    def stdev(self):
        'Standard deviation of the normal distribution.'
        return self.sigma

    @property
    def variance(self):
        'Square of the standard deviation.'
        return self.sigma ** 2.0

    def __add__(x1, x2):
        '''Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu + x2, x1.sigma)
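
    # Illustrative: summing two independent normal variables adds the means and
    # combines the standard deviations in quadrature (here a 3-4-5 triangle):
    #     NormalDist(2, 3) + NormalDist(1, 4)  ->  NormalDist(mu=3, sigma=5.0)
    #     NormalDist(2, 3) + 10                ->  NormalDist(mu=12, sigma=3)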

    def __sub__(x1, x2):
        '''Subtract a constant or another NormalDist instance.

        If *other* is a constant, translate by the constant mu,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu - x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu - x2, x1.sigma)

    def __mul__(x1, x2):
        '''Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu * x2, x1.sigma * fabs(x2))

    def __truediv__(x1, x2):
        '''Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu / x2, x1.sigma / fabs(x2))

    def __pos__(x1):
        'Return a copy of the instance.'
        return NormalDist(x1.mu, x1.sigma)

    def __neg__(x1):
        'Negates mu while keeping sigma the same.'
        return NormalDist(-x1.mu, x1.sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        'Subtract a NormalDist from a constant or another NormalDist.'
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        'Two NormalDist objects are equal if their mu and sigma are both equal.'
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return (x1.mu, x1.sigma) == (x2.mu, x2.sigma)

    def __repr__(self):
        return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})'


if __name__ == '__main__':

    # Show math operations computed analytically in comparison
    # to a Monte Carlo simulation of the same operations

    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat
    import doctest

    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)

    # Test scaling by a constant
    assert (g1 * 5 / 5).mu == g1.mu
    assert (g1 * 5 / 5).sigma == g1.sigma

    n = 100_000
    G1 = g1.samples(n)
    G2 = g2.samples(n)

    for func in (add, sub):
        print(f'\nTest {func.__name__} with another NormalDist:')
        print(func(g1, g2))
        print(NormalDist.from_samples(map(func, G1, G2)))

    const = 11
    for func in (add, sub, mul, truediv):
        print(f'\nTest {func.__name__} with a constant:')
        print(func(g1, const))
        print(NormalDist.from_samples(map(func, G1, repeat(const))))

    const = 19
    for func in (add, sub, mul):
        print(f'\nTest constant with {func.__name__}:')
        print(func(const, g1))
        print(NormalDist.from_samples(map(func, repeat(const), G1)))

    def assert_close(G1, G2):
        assert isclose(G1.mu, G2.mu, rel_tol=0.01), (G1, G2)
        assert isclose(G1.sigma, G2.sigma, rel_tol=0.01), (G1, G2)

    X = NormalDist(-105, 73)
    Y = NormalDist(31, 47)
    s = 32.75
    n = 100_000

    S = NormalDist.from_samples([x + s for x in X.samples(n)])
    assert_close(X + s, S)

    S = NormalDist.from_samples([x - s for x in X.samples(n)])
    assert_close(X - s, S)

    S = NormalDist.from_samples([x * s for x in X.samples(n)])
    assert_close(X * s, S)

    S = NormalDist.from_samples([x / s for x in X.samples(n)])
    assert_close(X / s, S)

    S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X + Y, S)

    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)

    print(doctest.testmod())