"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

==================  =============================================
Function            Description
==================  =============================================
mean                Arithmetic mean (average) of data.
fmean               Fast, floating point arithmetic mean.
harmonic_mean       Harmonic mean of data.
median              Median (middle value) of data.
median_low          Low median of data.
median_high         High median of data.
median_grouped      Median, or 50th percentile, of grouped data.
mode                Mode (most common value) of data.
multimode           List of modes (most common values of data).
==================  =============================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5. The median of these data points is 2.8333...


Calculating variability or spread
---------------------------------

==================  =============================================
Function            Description
==================  =============================================
pvariance           Population variance of data.
variance            Sample variance of data.
pstdev              Population standard deviation of data.
stdev               Sample standard deviation of data.
==================  =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = ['StatisticsError', 'NormalDist',
           'pstdev', 'pvariance', 'stdev', 'variance',
           'median', 'median_low', 'median_high', 'median_grouped',
           'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
           ]

import math
import numbers
import random

from fractions import Fraction
from decimal import Decimal
from itertools import groupby
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from operator import itemgetter
from collections import Counter

# === Exceptions ===

class StatisticsError(ValueError):
    pass

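# Because StatisticsError subclasses ValueError, callers can catch it with
# either exception type.  A minimal illustration (not part of the module's
# doctests):
#
#     >>> try:
#     ...     mean([])
#     ... except ValueError:      # also catches StatisticsError
#     ...     print('no data')
#     no data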

# === Private utilities ===

def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get
    T = _coerce(int, type(start))
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
    """
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S: return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool: return T
    if T is int: return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T): return S
    if issubclass(T, S): return T
    # Ints coerce to the other type.
    if issubclass(T, int): return S
    if issubclass(S, int): return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))


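# A rough sketch of how the _coerce() rules play out in practice (these calls
# are illustrative only and are not part of the module's doctests):
#
#     >>> _coerce(int, float)            # int defers to the other type
#     <class 'float'>
#     >>> _coerce(Fraction, float)       # Fraction & float -> float
#     <class 'float'>
#     >>> _coerce(float, Decimal)        # no rule matches
#     Traceback (most recent call last):
#         ...
#     TypeError: don't know how to coerce float and Decimal

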
def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator)/T(value.denominator)
        else:
            raise


def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError


def _find_rteq(a, l, x):
    'Locate the rightmost value exactly equal to x'
    i = bisect_right(a, x, lo=l)
    if i != (len(a)+1) and a[i-1] == x:
        return i-1
    raise ValueError


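# Worked example for the two bisection helpers above (illustrative only, not
# part of the module's doctests): with a = [1, 2, 2, 2, 3],
# _find_lteq(a, 2) == 1 and _find_rteq(a, 1, 2) == 3, so median_grouped() can
# compute cf (items below the median class) and f (items inside the class)
# without a linear scan.

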
def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total/n, T)

def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and always returns a float.
    The result is highly accurate but not as exact as mean().
    If the input dataset is empty, a StatisticsError is raised.

    >>> fmean([3.5, 4.0, 5.25])
    4.25

    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(x):
            nonlocal n
            n += 1
            return x
        total = fsum(map(count, data))
    else:
        total = fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None

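# The count() wrapper above lets fmean() make a single pass over inputs that
# have no __len__, such as generators.  A quick illustration (not part of the
# module's doctests):
#
#     >>> fmean(x / 2 for x in range(5))   # 0.0, 0.5, 1.0, 1.5, 2.0
#     1.0
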
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earnings) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    try:
        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        return 0
    assert count == n
    return _convert(n/total, T)


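# Arithmetic behind the docstring example above (illustration only): the
# harmonic mean of [2.5, 3, 10] is
#
#     3 / (1/2.5 + 1/3 + 1/10) = 3 / (0.4 + 0.3333... + 0.1) = 3.6
#
# i.e. n divided by the sum of reciprocals, which is what the call to _sum()
# above computes.

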
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        i = n//2
        return (data[i - 1] + data[i])/2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        return data[n//2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n//2]


def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2

    # Use bisection to locate x in data in O(log n) time.
    # Find the position of the leftmost occurrence of x in data.
    l1 = _find_lteq(data, x)
    # Find the position of the rightmost occurrence of x in data[l1:len(data)].
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    cf = l1
    f = l2 - l1 + 1
    return L + interval*(n/2 - cf)/f


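# Worked example of the interpolation above (illustration only), using the
# docstring data [1, 2, 2, 3, 4, 4, 4, 4, 4, 5] with interval=1: the median
# class is centred on x = 4, so L = 3.5, cf = 4 data points lie below the
# class, and f = 5 lie inside it, giving
#
#     L + interval*(n/2 - cf)/f  =  3.5 + 1*(5 - 4)/5  =  3.7
#
# which matches the doctest result.

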
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes, return the first one encountered:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    data = iter(data)
    try:
        return Counter(data).most_common(1)[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []

    """
    counts = Counter(iter(data)).most_common()
    maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
    return list(map(itemgetter(0), mode_items))


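# How the groupby() step above works (illustration only): most_common()
# returns (value, count) pairs sorted by count in descending order, e.g.
# [('b', 4), ('d', 4), ('f', 4), ('a', 2), ...] for 'aabbbbccddddeeffffgg'.
# Grouping by the count field and keeping only the first group therefore
# yields exactly the values tied for the highest count.

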
# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html
#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
#
# Under no circumstances use the so-called "computational formula for
# variance", as that is only suitable for hand calculations with a small
# amount of low-precision data. It has terrible numeric properties.
#
# See a comparison of three computational methods here:
# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/

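# Illustration of why the "computational formula" is avoided (this sketch is
# not part of the module).  For data offset far from zero, e.g.
#
#     data = [1e9 + 4, 1e9 + 7, 1e9 + 13, 1e9 + 16]     # true pvariance = 22.5
#
# the one-pass formula (sum(x*x for x in data) - sum(data)**2/n) / n squares
# numbers near 1e18 and then subtracts two nearly equal quantities, so most
# of the significant digits cancel and the float result is dominated by
# rounding error.  The two-pass approach in _ss() below instead sums exact
# Fractions of the deviations from the mean, and pvariance(data) returns 22.5.
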
def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.
    """
    if c is None:
        c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    U, total2, count2 = _sum((x-c) for x in data)
    assert T == U and count == count2
    total -= total2**2/len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    return _convert(ss/(n-1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be an iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    This function does not check that ``mu`` is actually the mean of ``data``.
    Giving arbitrary values for ``mu`` may lead to invalid or impossible
    results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    return _convert(ss/n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)

## Normal Distribution #####################################################

class NormalDist:
    'Normal distribution of a random variable'
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = ('mu', 'sigma')

    def __init__(self, mu=0.0, sigma=1.0):
        'NormalDist where mu is the mean and sigma is the standard deviation.'
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self.mu = mu
        self.sigma = sigma

    @classmethod
    def from_samples(cls, data):
        'Make a normal distribution instance from sample data.'
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, seed=None):
        'Generate *n* samples for a given mean and standard deviation.'
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self.mu, self.sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        'Probability density function. P(x <= X < x+dx) / dx'
        variance = self.sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self.mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance)

    def cdf(self, x):
        'Cumulative distribution function. P(X <= x)'
        if not self.sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))

    def inv_cdf(self, p):
        '''Inverse cumulative distribution function. x : P(X <= x) = p

        Finds the value of the random variable such that the probability of the
        variable being less than or equal to that value equals the given probability.

        This function is also called the percent point function or quantile function.
        '''
        if (p <= 0.0 or p >= 1.0):
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self.sigma <= 0.0:
            raise StatisticsError('inv_cdf() not defined when sigma is at or below zero')

        # There is no closed-form solution to the inverse CDF for the normal
        # distribution, so we use a rational approximation instead:
        # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
        # Normal Distribution". Applied Statistics. Blackwell Publishing. 37
        # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.

        q = p - 0.5
        if fabs(q) <= 0.425:
            r = 0.180625 - q * q
            num = (((((((2.50908_09287_30122_6727e+3 * r +
                         3.34305_75583_58812_8105e+4) * r +
                         6.72657_70927_00870_0853e+4) * r +
                         4.59219_53931_54987_1457e+4) * r +
                         1.37316_93765_50946_1125e+4) * r +
                         1.97159_09503_06551_4427e+3) * r +
                         1.33141_66789_17843_7745e+2) * r +
                         3.38713_28727_96366_6080e+0) * q
            den = (((((((5.22649_52788_52854_5610e+3 * r +
                         2.87290_85735_72194_2674e+4) * r +
                         3.93078_95800_09271_0610e+4) * r +
                         2.12137_94301_58659_5867e+4) * r +
                         5.39419_60214_24751_1077e+3) * r +
                         6.87187_00749_20579_0830e+2) * r +
                         4.23133_30701_60091_1252e+1) * r +
                         1.0)
            x = num / den
            return self.mu + (x * self.sigma)
        r = p if q <= 0.0 else 1.0 - p
        r = sqrt(-log(r))
        if r <= 5.0:
            r = r - 1.6
            num = (((((((7.74545_01427_83414_07640e-4 * r +
                         2.27238_44989_26918_45833e-2) * r +
                         2.41780_72517_74506_11770e-1) * r +
                         1.27045_82524_52368_38258e+0) * r +
                         3.64784_83247_63204_60504e+0) * r +
                         5.76949_72214_60691_40550e+0) * r +
                         4.63033_78461_56545_29590e+0) * r +
                         1.42343_71107_49683_57734e+0)
            den = (((((((1.05075_00716_44416_84324e-9 * r +
                         5.47593_80849_95344_94600e-4) * r +
                         1.51986_66563_61645_71966e-2) * r +
                         1.48103_97642_74800_74590e-1) * r +
                         6.89767_33498_51000_04550e-1) * r +
                         1.67638_48301_83803_84940e+0) * r +
                         2.05319_16266_37758_82187e+0) * r +
                         1.0)
        else:
            r = r - 5.0
            num = (((((((2.01033_43992_92288_13265e-7 * r +
                         2.71155_55687_43487_57815e-5) * r +
                         1.24266_09473_88078_43860e-3) * r +
                         2.65321_89526_57612_30930e-2) * r +
                         2.96560_57182_85048_91230e-1) * r +
                         1.78482_65399_17291_33580e+0) * r +
                         5.46378_49111_64114_36990e+0) * r +
                         6.65790_46435_01103_77720e+0)
            den = (((((((2.04426_31033_89939_78564e-15 * r +
                         1.42151_17583_16445_88870e-7) * r +
                         1.84631_83175_10054_68180e-5) * r +
                         7.86869_13114_56132_59100e-4) * r +
                         1.48753_61290_85061_48525e-2) * r +
                         1.36929_88092_27358_05310e-1) * r +
                         5.99832_20655_58879_37690e-1) * r +
                         1.0)
        x = num / den
        if q < 0.0:
            x = -x
        return self.mu + (x * self.sigma)

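    # Sanity check for inv_cdf(), shown as an illustration only (not part of
    # the module's doctests): it should invert cdf() to within the accuracy
    # of the AS241 approximation, e.g.
    #
    #     >>> nd = NormalDist(10, 2)
    #     >>> round(nd.inv_cdf(nd.cdf(12.5)), 9)
    #     12.5
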
    def overlap(self, other):
        '''Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        '''
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y.sigma, Y.mu) < (X.sigma, X.mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y.mu - X.mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X.sigma * sqrt(2.0)))
        a = X.mu * Y_var - Y.mu * X_var
        b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

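    # Quick sanity checks on overlap(), illustrative only: two identical
    # distributions overlap completely, e.g. NormalDist().overlap(NormalDist())
    # returns 1.0, and the result shrinks toward 0.0 as the means move apart
    # relative to the standard deviations.
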
    @property
    def mean(self):
        'Arithmetic mean of the normal distribution.'
        return self.mu

    @property
    def stdev(self):
        'Standard deviation of the normal distribution.'
        return self.sigma

    @property
    def variance(self):
        'Square of the standard deviation.'
        return self.sigma ** 2.0

    def __add__(x1, x2):
        '''Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu + x2, x1.sigma)

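    # Worked example for the variance addition above (illustration only):
    # adding independent NormalDist(3, 4) and NormalDist(0, 3) gives a mean of
    # 3 + 0 = 3 and a standard deviation of hypot(4, 3) = sqrt(16 + 9) = 5,
    # i.e. NormalDist(mu=3, sigma=5.0).
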
    def __sub__(x1, x2):
        '''Subtract a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu - x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu - x2, x1.sigma)

    def __mul__(x1, x2):
        '''Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu * x2, x1.sigma * fabs(x2))

    def __truediv__(x1, x2):
        '''Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu / x2, x1.sigma / fabs(x2))

    def __pos__(x1):
        'Return a copy of the instance.'
        return NormalDist(x1.mu, x1.sigma)

    def __neg__(x1):
        'Negates mu while keeping sigma the same.'
        return NormalDist(-x1.mu, x1.sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        'Subtract a NormalDist from a constant or another NormalDist.'
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        'Two NormalDist objects are equal if their mu and sigma are both equal.'
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return (x1.mu, x1.sigma) == (x2.mu, x2.sigma)

    def __repr__(self):
        return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})'


if __name__ == '__main__':

    # Show math operations computed analytically in comparison
    # to a Monte Carlo simulation of the same operations

    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat
    import doctest

    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)

    # Test scaling by a constant
    assert (g1 * 5 / 5).mu == g1.mu
    assert (g1 * 5 / 5).sigma == g1.sigma

    n = 100_000
    G1 = g1.samples(n)
    G2 = g2.samples(n)

    for func in (add, sub):
        print(f'\nTest {func.__name__} with another NormalDist:')
        print(func(g1, g2))
        print(NormalDist.from_samples(map(func, G1, G2)))

    const = 11
    for func in (add, sub, mul, truediv):
        print(f'\nTest {func.__name__} with a constant:')
        print(func(g1, const))
        print(NormalDist.from_samples(map(func, G1, repeat(const))))

    const = 19
    for func in (add, sub, mul):
        print(f'\nTest constant with {func.__name__}:')
        print(func(const, g1))
        print(NormalDist.from_samples(map(func, repeat(const), G1)))

    def assert_close(G1, G2):
        assert isclose(G1.mu, G2.mu, rel_tol=0.01), (G1, G2)
        assert isclose(G1.sigma, G2.sigma, rel_tol=0.01), (G1, G2)

    X = NormalDist(-105, 73)
    Y = NormalDist(31, 47)
    s = 32.75
    n = 100_000

    S = NormalDist.from_samples([x + s for x in X.samples(n)])
    assert_close(X + s, S)

    S = NormalDist.from_samples([x - s for x in X.samples(n)])
    assert_close(X - s, S)

    S = NormalDist.from_samples([x * s for x in X.samples(n)])
    assert_close(X * s, S)

    S = NormalDist.from_samples([x / s for x in X.samples(n)])
    assert_close(X / s, S)

    S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X + Y, S)

    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)

    print(doctest.testmod())