blob: 673a162b3a79fc047c8423642006d705e06b0099 [file] [log] [blame]
Larry Hastingsf5e987b2013-10-19 11:50:09 -07001"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
Raymond Hettinger9013ccf2019-04-23 00:06:35 -070010================== ==================================================
Larry Hastingsf5e987b2013-10-19 11:50:09 -070011Function Description
Raymond Hettinger9013ccf2019-04-23 00:06:35 -070012================== ==================================================
Larry Hastingsf5e987b2013-10-19 11:50:09 -070013mean Arithmetic mean (average) of data.
Raymond Hettinger72800482019-04-23 01:35:16 -070014fmean Fast, floating point arithmetic mean.
Raymond Hettinger6463ba32019-04-07 09:20:03 -070015geometric_mean Geometric mean of data.
Steven D'Apranoa474afd2016-08-09 12:49:01 +100016harmonic_mean Harmonic mean of data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -070017median Median (middle value) of data.
18median_low Low median of data.
19median_high High median of data.
20median_grouped Median, or 50th percentile, of grouped data.
21mode Mode (most common value) of data.
Raymond Hettinger6463ba32019-04-07 09:20:03 -070022multimode List of modes (most common values of data).
Raymond Hettinger9013ccf2019-04-23 00:06:35 -070023quantiles Divide data into intervals with equal probability.
24================== ==================================================
Larry Hastingsf5e987b2013-10-19 11:50:09 -070025
26Calculate the arithmetic mean ("the average") of data:
27
28>>> mean([-1.0, 2.5, 3.25, 5.75])
292.625
30
31
32Calculate the standard median of discrete data:
33
34>>> median([2, 3, 4, 5])
353.5
36
37
38Calculate the median, or 50th percentile, of data grouped into class intervals
39centred on the data values provided. E.g. if your data points are rounded to
40the nearest whole number:
41
42>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
432.8333333333...
44
45This should be interpreted in this way: you have two data points in the class
46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47the class interval 3.5-4.5. The median of these data points is 2.8333...
48
49
50Calculating variability or spread
51---------------------------------
52
53================== =============================================
54Function Description
55================== =============================================
56pvariance Population variance of data.
57variance Sample variance of data.
58pstdev Population standard deviation of data.
59stdev Sample standard deviation of data.
60================== =============================================
61
62Calculate the standard deviation of sample data:
63
64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
654.38961843444...
66
67If you have previously calculated the mean, you can pass it as the optional
68second argument to the four "spread" functions to avoid recalculating it:
69
70>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
71>>> mu = mean(data)
72>>> pvariance(data, mu)
732.5
74
75
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +020076Statistics for relations between two inputs
77-------------------------------------------
78
79================== ====================================================
80Function Description
81================== ====================================================
82covariance Sample covariance for two variables.
83correlation Pearson's correlation coefficient for two variables.
84linear_regression Intercept and slope for simple linear regression.
85================== ====================================================
86
87Calculate covariance, Pearson's correlation, and simple linear regression
88for two inputs:
89
90>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
91>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
92>>> covariance(x, y)
930.75
94>>> correlation(x, y) #doctest: +ELLIPSIS
950.31622776601...
96>>> linear_regression(x, y) #doctest:
97LinearRegression(intercept=1.5, slope=0.1)
98
99
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700100Exceptions
101----------
102
103A single exception is defined: StatisticsError is a subclass of ValueError.
104
105"""
106
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700107__all__ = [
108 'NormalDist',
109 'StatisticsError',
110 'fmean',
111 'geometric_mean',
112 'harmonic_mean',
113 'mean',
114 'median',
115 'median_grouped',
116 'median_high',
117 'median_low',
118 'mode',
119 'multimode',
120 'pstdev',
121 'pvariance',
122 'quantiles',
123 'stdev',
124 'variance',
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200125 'correlation',
126 'covariance',
127 'linear_regression',
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700128]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700129
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700130import math
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000131import numbers
Raymond Hettinger11c79532019-02-23 14:44:07 -0800132import random
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700133
134from fractions import Fraction
135from decimal import Decimal
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800136from itertools import groupby, repeat
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000137from bisect import bisect_left, bisect_right
Raymond Hettinger318d5372019-03-06 22:59:40 -0800138from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700139from operator import itemgetter
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200140from collections import Counter, namedtuple
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700141
142# === Exceptions ===
143
144class StatisticsError(ValueError):
145 pass
146
147
148# === Private utilities ===
149
150def _sum(data, start=0):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100151 """_sum(data [, start]) -> (type, sum, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700152
Steven D'Apranob28c3272015-12-01 19:59:53 +1100153 Return a high-precision sum of the given numeric data as a fraction,
154 together with the type to be converted to and the count of items.
155
156 If optional argument ``start`` is given, it is added to the total.
157 If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700158
159
160 Examples
161 --------
162
163 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700164 (<class 'float'>, Fraction(11, 1), 5)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700165
166 Some sources of round-off error will be avoided:
167
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000168 # Built-in sum returns zero.
169 >>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700170 (<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700171
172 Fractions and Decimals are also supported:
173
174 >>> from fractions import Fraction as F
175 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Petersonab078e92016-07-13 21:13:29 -0700176 (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700177
178 >>> from decimal import Decimal as D
179 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
180 >>> _sum(data)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700181 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700182
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000183 Mixed types are currently treated as an error, except that int is
184 allowed.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700185 """
Steven D'Apranob28c3272015-12-01 19:59:53 +1100186 count = 0
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700187 n, d = _exact_ratio(start)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100188 partials = {d: n}
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700189 partials_get = partials.get
Steven D'Apranob28c3272015-12-01 19:59:53 +1100190 T = _coerce(int, type(start))
191 for typ, values in groupby(data, type):
192 T = _coerce(T, typ) # or raise TypeError
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700193 for n, d in map(_exact_ratio, values):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100194 count += 1
195 partials[d] = partials_get(d, 0) + n
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700196 if None in partials:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100197 # The sum will be a NAN or INF. We can ignore all the finite
198 # partials, and just look at this special one.
199 total = partials[None]
200 assert not _isfinite(total)
201 else:
202 # Sum all the partial sums using builtin sum.
203 # FIXME is this faster if we sum them in order of the denominator?
204 total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
205 return (T, total, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700206
207
Steven D'Apranob28c3272015-12-01 19:59:53 +1100208def _isfinite(x):
209 try:
210 return x.is_finite() # Likely a Decimal.
211 except AttributeError:
212 return math.isfinite(x) # Coerces to float first.
213
214
215def _coerce(T, S):
216 """Coerce types T and S to a common type, or raise TypeError.
217
218 Coercion rules are currently an implementation detail. See the CoerceTest
219 test class in test_statistics for details.
220 """
221 # See http://bugs.python.org/issue24068.
222 assert T is not bool, "initial type T is bool"
223 # If the types are the same, no need to coerce anything. Put this
224 # first, so that the usual case (no coercion needed) happens as soon
225 # as possible.
226 if T is S: return T
227 # Mixed int & other coerce to the other type.
228 if S is int or S is bool: return T
229 if T is int: return S
230 # If one is a (strict) subclass of the other, coerce to the subclass.
231 if issubclass(S, T): return S
232 if issubclass(T, S): return T
233 # Ints coerce to the other type.
234 if issubclass(T, int): return S
235 if issubclass(S, int): return T
236 # Mixed fraction & float coerces to float (or float subclass).
237 if issubclass(T, Fraction) and issubclass(S, float):
238 return S
239 if issubclass(T, float) and issubclass(S, Fraction):
240 return T
241 # Any other combination is disallowed.
242 msg = "don't know how to coerce %s and %s"
243 raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000244
245
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700246def _exact_ratio(x):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100247 """Return Real number x to exact (numerator, denominator) pair.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700248
249 >>> _exact_ratio(0.25)
250 (1, 4)
251
252 x is expected to be an int, Fraction, Decimal or float.
253 """
254 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100255 # Optimise the common case of floats. We expect that the most often
256 # used numeric type will be builtin floats, so try to make this as
257 # fast as possible.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000258 if type(x) is float or type(x) is Decimal:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100259 return x.as_integer_ratio()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700260 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100261 # x may be an int, Fraction, or Integral ABC.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700262 return (x.numerator, x.denominator)
263 except AttributeError:
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700264 try:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000265 # x may be a float or Decimal subclass.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700266 return x.as_integer_ratio()
267 except AttributeError:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000268 # Just give up?
269 pass
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700270 except (OverflowError, ValueError):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100271 # float NAN or INF.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000272 assert not _isfinite(x)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700273 return (x, None)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100274 msg = "can't convert type '{}' to numerator/denominator"
275 raise TypeError(msg.format(type(x).__name__))
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700276
277
Steven D'Apranob28c3272015-12-01 19:59:53 +1100278def _convert(value, T):
279 """Convert value to given numeric type T."""
280 if type(value) is T:
281 # This covers the cases where T is Fraction, or where value is
282 # a NAN or INF (Decimal or float).
283 return value
284 if issubclass(T, int) and value.denominator != 1:
285 T = float
286 try:
287 # FIXME: what do we do if this overflows?
288 return T(value)
289 except TypeError:
290 if issubclass(T, Decimal):
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700291 return T(value.numerator) / T(value.denominator)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100292 else:
293 raise
294
295
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000296def _find_lteq(a, x):
297 'Locate the leftmost value exactly equal to x'
298 i = bisect_left(a, x)
299 if i != len(a) and a[i] == x:
300 return i
301 raise ValueError
302
303
304def _find_rteq(a, l, x):
305 'Locate the rightmost value exactly equal to x'
306 i = bisect_right(a, x, lo=l)
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700307 if i != (len(a) + 1) and a[i - 1] == x:
308 return i - 1
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000309 raise ValueError
310
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000311
312def _fail_neg(values, errmsg='negative value'):
313 """Iterate over values, failing if any are less than zero."""
314 for x in values:
315 if x < 0:
316 raise StatisticsError(errmsg)
317 yield x
318
319
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700320# === Measures of central tendency (averages) ===
321
322def mean(data):
323 """Return the sample arithmetic mean of data.
324
325 >>> mean([1, 2, 3, 4, 4])
326 2.8
327
328 >>> from fractions import Fraction as F
329 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
330 Fraction(13, 21)
331
332 >>> from decimal import Decimal as D
333 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
334 Decimal('0.5625')
335
336 If ``data`` is empty, StatisticsError will be raised.
337 """
338 if iter(data) is data:
339 data = list(data)
340 n = len(data)
341 if n < 1:
342 raise StatisticsError('mean requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100343 T, total, count = _sum(data)
344 assert count == n
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700345 return _convert(total / n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700346
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700347
Raymond Hettinger47d99872019-02-21 15:06:29 -0800348def fmean(data):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700349 """Convert data to floats and compute the arithmetic mean.
Raymond Hettinger47d99872019-02-21 15:06:29 -0800350
351 This runs faster than the mean() function and it always returns a float.
Raymond Hettinger47d99872019-02-21 15:06:29 -0800352 If the input dataset is empty, it raises a StatisticsError.
353
354 >>> fmean([3.5, 4.0, 5.25])
355 4.25
Raymond Hettinger47d99872019-02-21 15:06:29 -0800356 """
357 try:
358 n = len(data)
359 except TypeError:
360 # Handle iterators that do not define __len__().
361 n = 0
Raymond Hettinger6c01ebc2019-06-05 07:39:38 -0700362 def count(iterable):
Raymond Hettinger47d99872019-02-21 15:06:29 -0800363 nonlocal n
Raymond Hettinger6c01ebc2019-06-05 07:39:38 -0700364 for n, x in enumerate(iterable, start=1):
365 yield x
366 total = fsum(count(data))
Raymond Hettinger47d99872019-02-21 15:06:29 -0800367 else:
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700368 total = fsum(data)
Raymond Hettinger47d99872019-02-21 15:06:29 -0800369 try:
370 return total / n
371 except ZeroDivisionError:
372 raise StatisticsError('fmean requires at least one data point') from None
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700373
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700374
Raymond Hettinger6463ba32019-04-07 09:20:03 -0700375def geometric_mean(data):
376 """Convert data to floats and compute the geometric mean.
377
378 Raises a StatisticsError if the input dataset is empty,
379 if it contains a zero, or if it contains a negative value.
380
381 No special efforts are made to achieve exact results.
382 (However, this may change in the future.)
383
384 >>> round(geometric_mean([54, 24, 36]), 9)
385 36.0
386 """
387 try:
388 return exp(fmean(map(log, data)))
389 except ValueError:
390 raise StatisticsError('geometric mean requires a non-empty dataset '
391 ' containing positive numbers') from None
392
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700393
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800394def harmonic_mean(data, weights=None):
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000395 """Return the harmonic mean of data.
396
Raymond Hettinger30a8b282021-02-07 16:44:42 -0800397 The harmonic mean is the reciprocal of the arithmetic mean of the
398 reciprocals of the data. It can be used for averaging ratios or
399 rates, for example speeds.
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000400
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800401 Suppose a car travels 40 km/hr for 5 km and then speeds-up to
402 60 km/hr for another 5 km. What is the average speed?
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000403
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800404 >>> harmonic_mean([40, 60])
405 48.0
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000406
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800407 Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
408 speeds-up to 60 km/hr for the remaining 30 km of the journey. What
409 is the average speed?
410
411 >>> harmonic_mean([40, 60], weights=[5, 30])
412 56.0
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000413
414 If ``data`` is empty, or any element is less than zero,
415 ``harmonic_mean`` will raise ``StatisticsError``.
416 """
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000417 if iter(data) is data:
418 data = list(data)
419 errmsg = 'harmonic mean does not support negative values'
420 n = len(data)
421 if n < 1:
422 raise StatisticsError('harmonic_mean requires at least one data point')
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800423 elif n == 1 and weights is None:
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000424 x = data[0]
425 if isinstance(x, (numbers.Real, Decimal)):
426 if x < 0:
427 raise StatisticsError(errmsg)
428 return x
429 else:
430 raise TypeError('unsupported type')
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800431 if weights is None:
432 weights = repeat(1, n)
433 sum_weights = n
434 else:
435 if iter(weights) is weights:
436 weights = list(weights)
437 if len(weights) != n:
438 raise StatisticsError('Number of weights does not match data size')
439 _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000440 try:
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800441 data = _fail_neg(data, errmsg)
442 T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000443 except ZeroDivisionError:
444 return 0
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800445 if total <= 0:
446 raise StatisticsError('Weighted sum must be positive')
447 return _convert(sum_weights / total, T)
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000448
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700449# FIXME: investigate ways to calculate medians without sorting? Quickselect?
450def median(data):
451 """Return the median (middle value) of numeric data.
452
453 When the number of data points is odd, return the middle data point.
454 When the number of data points is even, the median is interpolated by
455 taking the average of the two middle values:
456
457 >>> median([1, 3, 5])
458 3
459 >>> median([1, 3, 5, 7])
460 4.0
461
462 """
463 data = sorted(data)
464 n = len(data)
465 if n == 0:
466 raise StatisticsError("no median for empty data")
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700467 if n % 2 == 1:
468 return data[n // 2]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700469 else:
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700470 i = n // 2
471 return (data[i - 1] + data[i]) / 2
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700472
473
474def median_low(data):
475 """Return the low median of numeric data.
476
477 When the number of data points is odd, the middle value is returned.
478 When it is even, the smaller of the two middle values is returned.
479
480 >>> median_low([1, 3, 5])
481 3
482 >>> median_low([1, 3, 5, 7])
483 3
484
485 """
486 data = sorted(data)
487 n = len(data)
488 if n == 0:
489 raise StatisticsError("no median for empty data")
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700490 if n % 2 == 1:
491 return data[n // 2]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700492 else:
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700493 return data[n // 2 - 1]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700494
495
496def median_high(data):
497 """Return the high median of data.
498
499 When the number of data points is odd, the middle value is returned.
500 When it is even, the larger of the two middle values is returned.
501
502 >>> median_high([1, 3, 5])
503 3
504 >>> median_high([1, 3, 5, 7])
505 5
506
507 """
508 data = sorted(data)
509 n = len(data)
510 if n == 0:
511 raise StatisticsError("no median for empty data")
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700512 return data[n // 2]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700513
514
515def median_grouped(data, interval=1):
Zachary Waredf2660e2015-10-27 22:00:41 -0500516 """Return the 50th percentile (median) of grouped continuous data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700517
518 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
519 3.7
520 >>> median_grouped([52, 52, 53, 54])
521 52.5
522
523 This calculates the median as the 50th percentile, and should be
524 used when your data is continuous and grouped. In the above example,
525 the values 1, 2, 3, etc. actually represent the midpoint of classes
526 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
527 class 3.5-4.5, and interpolation is used to estimate it.
528
529 Optional argument ``interval`` represents the class interval, and
530 defaults to 1. Changing the class interval naturally will change the
531 interpolated 50th percentile value:
532
533 >>> median_grouped([1, 3, 3, 5, 7], interval=1)
534 3.25
535 >>> median_grouped([1, 3, 3, 5, 7], interval=2)
536 3.5
537
538 This function does not check whether the data points are at least
539 ``interval`` apart.
540 """
541 data = sorted(data)
542 n = len(data)
543 if n == 0:
544 raise StatisticsError("no median for empty data")
545 elif n == 1:
546 return data[0]
547 # Find the value at the midpoint. Remember this corresponds to the
548 # centre of the class interval.
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700549 x = data[n // 2]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700550 for obj in (x, interval):
551 if isinstance(obj, (str, bytes)):
552 raise TypeError('expected number but got %r' % obj)
553 try:
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700554 L = x - interval / 2 # The lower limit of the median interval.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700555 except TypeError:
556 # Mixed type. For now we just coerce to float.
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700557 L = float(x) - float(interval) / 2
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000558
559 # Uses bisection search to search for x in data with log(n) time complexity
Martin Panterf1579822016-05-26 06:03:33 +0000560 # Find the position of leftmost occurrence of x in data
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000561 l1 = _find_lteq(data, x)
Martin Panterf1579822016-05-26 06:03:33 +0000562 # Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000563 # Assuming always l1 <= l2
564 l2 = _find_rteq(data, l1, x)
565 cf = l1
566 f = l2 - l1 + 1
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700567 return L + interval * (n / 2 - cf) / f
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700568
569
570def mode(data):
571 """Return the most common data point from discrete or nominal data.
572
573 ``mode`` assumes discrete data, and returns a single value. This is the
574 standard treatment of the mode as commonly taught in schools:
575
Raymond Hettingere4810b22019-09-05 00:18:47 -0700576 >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
577 3
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700578
579 This also works with nominal (non-numeric) data:
580
Raymond Hettingere4810b22019-09-05 00:18:47 -0700581 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
582 'red'
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700583
Raymond Hettingere4810b22019-09-05 00:18:47 -0700584 If there are multiple modes with same frequency, return the first one
585 encountered:
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700586
587 >>> mode(['red', 'red', 'green', 'blue', 'blue'])
588 'red'
589
590 If *data* is empty, ``mode``, raises StatisticsError.
591
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700592 """
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700593 pairs = Counter(iter(data)).most_common(1)
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700594 try:
Raymond Hettinger7ce4bfa2019-09-20 21:46:52 -0700595 return pairs[0][0]
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700596 except IndexError:
597 raise StatisticsError('no mode for empty data') from None
598
599
600def multimode(data):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700601 """Return a list of the most frequently occurring values.
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700602
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700603 Will return more than one result if there are multiple modes
604 or an empty list if *data* is empty.
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700605
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700606 >>> multimode('aabbbbbbbbcc')
607 ['b']
608 >>> multimode('aabbbbccddddeeffffgg')
609 ['b', 'd', 'f']
610 >>> multimode('')
611 []
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700612 """
613 counts = Counter(iter(data)).most_common()
614 maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
615 return list(map(itemgetter(0), mode_items))
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700616
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700617
Raymond Hettingercba9f842019-06-02 21:07:43 -0700618# Notes on methods for computing quantiles
619# ----------------------------------------
620#
621# There is no one perfect way to compute quantiles. Here we offer
622# two methods that serve common needs. Most other packages
623# surveyed offered at least one or both of these two, making them
624# "standard" in the sense of "widely-adopted and reproducible".
625# They are also easy to explain, easy to compute manually, and have
626# straight-forward interpretations that aren't surprising.
627
628# The default method is known as "R6", "PERCENTILE.EXC", or "expected
629# value of rank order statistics". The alternative method is known as
630# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
631
632# For sample data where there is a positive probability for values
633# beyond the range of the data, the R6 exclusive method is a
634# reasonable choice. Consider a random sample of nine values from a
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700635# population with a uniform distribution from 0.0 to 1.0. The
Raymond Hettingercba9f842019-06-02 21:07:43 -0700636# distribution of the third ranked sample point is described by
637# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
638# mean=0.300. Only the latter (which corresponds with R6) gives the
639# desired cut point with 30% of the population falling below that
640# value, making it comparable to a result from an inv_cdf() function.
Raymond Hettinger7ce4bfa2019-09-20 21:46:52 -0700641# The R6 exclusive method is also idempotent.
Raymond Hettingercba9f842019-06-02 21:07:43 -0700642
643# For describing population data where the end points are known to
644# be included in the data, the R7 inclusive method is a reasonable
645# choice. Instead of the mean, it uses the mode of the beta
646# distribution for the interior points. Per Hyndman & Fan, "One nice
647# property is that the vertices of Q7(p) divide the range into n - 1
648# intervals, and exactly 100p% of the intervals lie to the left of
649# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
650
Raymond Hettingereed5e9a2019-07-19 01:57:22 -0700651# If needed, other methods could be added. However, for now, the
652# position is that fewer options make for easier choices and that
653# external packages can be used for anything more advanced.
Raymond Hettingercba9f842019-06-02 21:07:43 -0700654
Raymond Hettinger272d0d02019-09-17 20:45:05 -0700655def quantiles(data, *, n=4, method='exclusive'):
Raymond Hettingere4810b22019-09-05 00:18:47 -0700656 """Divide *data* into *n* continuous intervals with equal probability.
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700657
658 Returns a list of (n - 1) cut points separating the intervals.
659
660 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
661 Set *n* to 100 for percentiles which gives the 99 cuts points that
Raymond Hettingere4810b22019-09-05 00:18:47 -0700662 separate *data* in to 100 equal sized groups.
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700663
Raymond Hettinger4db25d52019-09-08 16:57:58 -0700664 The *data* can be any iterable containing sample.
665 The cut points are linearly interpolated between data points.
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700666
Raymond Hettingere4810b22019-09-05 00:18:47 -0700667 If *method* is set to *inclusive*, *data* is treated as population
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700668 data. The minimum value is treated as the 0th percentile and the
669 maximum value is treated as the 100th percentile.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700670 """
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700671 if n < 1:
672 raise StatisticsError('n must be at least 1')
Raymond Hettingere4810b22019-09-05 00:18:47 -0700673 data = sorted(data)
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700674 ld = len(data)
675 if ld < 2:
676 raise StatisticsError('must have at least two data points')
677 if method == 'inclusive':
678 m = ld - 1
679 result = []
680 for i in range(1, n):
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700681 j, delta = divmod(i * m, n)
682 interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700683 result.append(interpolated)
684 return result
685 if method == 'exclusive':
686 m = ld + 1
687 result = []
688 for i in range(1, n):
689 j = i * m // n # rescale i to m/n
690 j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1
691 delta = i*m - j*n # exact integer math
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700692 interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700693 result.append(interpolated)
694 return result
695 raise ValueError(f'Unknown method: {method!r}')
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700696
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700697
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700698# === Measures of spread ===
699
700# See http://mathworld.wolfram.com/Variance.html
701# http://mathworld.wolfram.com/SampleVariance.html
702# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
703#
704# Under no circumstances use the so-called "computational formula for
705# variance", as that is only suitable for hand calculations with a small
706# amount of low-precision data. It has terrible numeric properties.
707#
708# See a comparison of three computational methods here:
709# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
710
711def _ss(data, c=None):
712 """Return sum of square deviations of sequence data.
713
714 If ``c`` is None, the mean is calculated in one pass, and the deviations
715 from the mean are calculated in a second pass. Otherwise, deviations are
716 calculated from ``c`` as given. Use the second case with care, as it can
717 lead to garbage results.
718 """
Raymond Hettingerd71ab4f2020-06-13 15:55:52 -0700719 if c is not None:
720 T, total, count = _sum((x-c)**2 for x in data)
721 return (T, total)
722 c = mean(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100723 T, total, count = _sum((x-c)**2 for x in data)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700724 # The following sum should mathematically equal zero, but due to rounding
725 # error may not.
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700726 U, total2, count2 = _sum((x - c) for x in data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100727 assert T == U and count == count2
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700728 total -= total2 ** 2 / len(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100729 assert not total < 0, 'negative sum of square deviations: %f' % total
730 return (T, total)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700731
732
733def variance(data, xbar=None):
734 """Return the sample variance of data.
735
736 data should be an iterable of Real-valued numbers, with at least two
737 values. The optional argument xbar, if given, should be the mean of
738 the data. If it is missing or None, the mean is automatically calculated.
739
740 Use this function when your data is a sample from a population. To
741 calculate the variance from the entire population, see ``pvariance``.
742
743 Examples:
744
745 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
746 >>> variance(data)
747 1.3720238095238095
748
749 If you have already calculated the mean of your data, you can pass it as
750 the optional second argument ``xbar`` to avoid recalculating it:
751
752 >>> m = mean(data)
753 >>> variance(data, m)
754 1.3720238095238095
755
756 This function does not check that ``xbar`` is actually the mean of
757 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
758 impossible results.
759
760 Decimals and Fractions are supported:
761
762 >>> from decimal import Decimal as D
763 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
764 Decimal('31.01875')
765
766 >>> from fractions import Fraction as F
767 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
768 Fraction(67, 108)
769
770 """
771 if iter(data) is data:
772 data = list(data)
773 n = len(data)
774 if n < 2:
775 raise StatisticsError('variance requires at least two data points')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100776 T, ss = _ss(data, xbar)
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700777 return _convert(ss / (n - 1), T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700778
779
780def pvariance(data, mu=None):
781 """Return the population variance of ``data``.
782
Raymond Hettinger733b9a32019-11-11 23:35:06 -0800783 data should be a sequence or iterable of Real-valued numbers, with at least one
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700784 value. The optional argument mu, if given, should be the mean of
785 the data. If it is missing or None, the mean is automatically calculated.
786
787 Use this function to calculate the variance from the entire population.
788 To estimate the variance from a sample, the ``variance`` function is
789 usually a better choice.
790
791 Examples:
792
793 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
794 >>> pvariance(data)
795 1.25
796
797 If you have already calculated the mean of the data, you can pass it as
798 the optional second argument to avoid recalculating it:
799
800 >>> mu = mean(data)
801 >>> pvariance(data, mu)
802 1.25
803
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700804 Decimals and Fractions are supported:
805
806 >>> from decimal import Decimal as D
807 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
808 Decimal('24.815')
809
810 >>> from fractions import Fraction as F
811 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
812 Fraction(13, 72)
813
814 """
815 if iter(data) is data:
816 data = list(data)
817 n = len(data)
818 if n < 1:
819 raise StatisticsError('pvariance requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100820 T, ss = _ss(data, mu)
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700821 return _convert(ss / n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700822
823
824def stdev(data, xbar=None):
825 """Return the square root of the sample variance.
826
827 See ``variance`` for arguments and other details.
828
829 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
830 1.0810874155219827
831
832 """
833 var = variance(data, xbar)
834 try:
835 return var.sqrt()
836 except AttributeError:
837 return math.sqrt(var)
838
839
840def pstdev(data, mu=None):
841 """Return the square root of the population variance.
842
843 See ``pvariance`` for arguments and other details.
844
845 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
846 0.986893273527251
847
848 """
849 var = pvariance(data, mu)
850 try:
851 return var.sqrt()
852 except AttributeError:
853 return math.sqrt(var)
Raymond Hettinger11c79532019-02-23 14:44:07 -0800854
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700855
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200856# === Statistics for relations between two inputs ===
857
858# See https://en.wikipedia.org/wiki/Covariance
859# https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
860# https://en.wikipedia.org/wiki/Simple_linear_regression
861
862
863def covariance(x, y, /):
864 """Covariance
865
866 Return the sample covariance of two inputs *x* and *y*. Covariance
867 is a measure of the joint variability of two inputs.
868
869 >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
870 >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
871 >>> covariance(x, y)
872 0.75
873 >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
874 >>> covariance(x, z)
875 -7.5
876 >>> covariance(z, x)
877 -7.5
878
879 """
880 n = len(x)
881 if len(y) != n:
882 raise StatisticsError('covariance requires that both inputs have same number of data points')
883 if n < 2:
884 raise StatisticsError('covariance requires at least two data points')
885 xbar = mean(x)
886 ybar = mean(y)
887 total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
888 return total / (n - 1)
889
890
891def correlation(x, y, /):
892 """Pearson's correlation coefficient
893
894 Return the Pearson's correlation coefficient for two inputs. Pearson's
895 correlation coefficient *r* takes values between -1 and +1. It measures the
896 strength and direction of the linear relationship, where +1 means very
897 strong, positive linear relationship, -1 very strong, negative linear
898 relationship, and 0 no linear relationship.
899
900 >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
901 >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
902 >>> correlation(x, x)
903 1.0
904 >>> correlation(x, y)
905 -1.0
906
907 """
908 n = len(x)
909 if len(y) != n:
910 raise StatisticsError('correlation requires that both inputs have same number of data points')
911 if n < 2:
912 raise StatisticsError('correlation requires at least two data points')
913 cov = covariance(x, y)
914 stdx = stdev(x)
915 stdy = stdev(y)
916 try:
917 return cov / (stdx * stdy)
918 except ZeroDivisionError:
919 raise StatisticsError('at least one of the inputs is constant')
920
921
922LinearRegression = namedtuple('LinearRegression', ['intercept', 'slope'])
923
924
925def linear_regression(regressor, dependent_variable, /):
926 """Intercept and slope for simple linear regression
927
928 Return the intercept and slope of simple linear regression
929 parameters estimated using ordinary least squares. Simple linear
930 regression describes relationship between *regressor* and
931 *dependent variable* in terms of linear function::
932
933 dependent_variable = intercept + slope * regressor + noise
934
935 where ``intercept`` and ``slope`` are the regression parameters that are
936 estimated, and noise term is an unobserved random variable, for the
937 variability of the data that was not explained by the linear regression
938 (it is equal to the difference between prediction and the actual values
939 of dependent variable).
940
941 The parameters are returned as a named tuple.
942
943 >>> regressor = [1, 2, 3, 4, 5]
944 >>> noise = NormalDist().samples(5, seed=42)
945 >>> dependent_variable = [2 + 3 * regressor[i] + noise[i] for i in range(5)]
946 >>> linear_regression(regressor, dependent_variable) #doctest: +ELLIPSIS
947 LinearRegression(intercept=1.75684970486..., slope=3.09078914170...)
948
949 """
950 n = len(regressor)
951 if len(dependent_variable) != n:
952 raise StatisticsError('linear regression requires that both inputs have same number of data points')
953 if n < 2:
954 raise StatisticsError('linear regression requires at least two data points')
955 try:
956 slope = covariance(regressor, dependent_variable) / variance(regressor)
957 except ZeroDivisionError:
958 raise StatisticsError('regressor is constant')
959 intercept = mean(dependent_variable) - slope * mean(regressor)
960 return LinearRegression(intercept=intercept, slope=slope)
961
962
Raymond Hettinger11c79532019-02-23 14:44:07 -0800963## Normal Distribution #####################################################
964
Dong-hee Na0a18ee42019-08-24 07:20:30 +0900965
966def _normal_dist_inv_cdf(p, mu, sigma):
967 # There is no closed-form solution to the inverse CDF for the normal
968 # distribution, so we use a rational approximation instead:
969 # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
970 # Normal Distribution". Applied Statistics. Blackwell Publishing. 37
971 # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
972 q = p - 0.5
973 if fabs(q) <= 0.425:
974 r = 0.180625 - q * q
975 # Hash sum: 55.88319_28806_14901_4439
976 num = (((((((2.50908_09287_30122_6727e+3 * r +
977 3.34305_75583_58812_8105e+4) * r +
978 6.72657_70927_00870_0853e+4) * r +
979 4.59219_53931_54987_1457e+4) * r +
980 1.37316_93765_50946_1125e+4) * r +
981 1.97159_09503_06551_4427e+3) * r +
982 1.33141_66789_17843_7745e+2) * r +
983 3.38713_28727_96366_6080e+0) * q
984 den = (((((((5.22649_52788_52854_5610e+3 * r +
985 2.87290_85735_72194_2674e+4) * r +
986 3.93078_95800_09271_0610e+4) * r +
987 2.12137_94301_58659_5867e+4) * r +
988 5.39419_60214_24751_1077e+3) * r +
989 6.87187_00749_20579_0830e+2) * r +
990 4.23133_30701_60091_1252e+1) * r +
991 1.0)
992 x = num / den
993 return mu + (x * sigma)
994 r = p if q <= 0.0 else 1.0 - p
995 r = sqrt(-log(r))
996 if r <= 5.0:
997 r = r - 1.6
998 # Hash sum: 49.33206_50330_16102_89036
999 num = (((((((7.74545_01427_83414_07640e-4 * r +
1000 2.27238_44989_26918_45833e-2) * r +
1001 2.41780_72517_74506_11770e-1) * r +
1002 1.27045_82524_52368_38258e+0) * r +
1003 3.64784_83247_63204_60504e+0) * r +
1004 5.76949_72214_60691_40550e+0) * r +
1005 4.63033_78461_56545_29590e+0) * r +
1006 1.42343_71107_49683_57734e+0)
1007 den = (((((((1.05075_00716_44416_84324e-9 * r +
1008 5.47593_80849_95344_94600e-4) * r +
1009 1.51986_66563_61645_71966e-2) * r +
1010 1.48103_97642_74800_74590e-1) * r +
1011 6.89767_33498_51000_04550e-1) * r +
1012 1.67638_48301_83803_84940e+0) * r +
1013 2.05319_16266_37758_82187e+0) * r +
1014 1.0)
1015 else:
1016 r = r - 5.0
1017 # Hash sum: 47.52583_31754_92896_71629
1018 num = (((((((2.01033_43992_92288_13265e-7 * r +
1019 2.71155_55687_43487_57815e-5) * r +
1020 1.24266_09473_88078_43860e-3) * r +
1021 2.65321_89526_57612_30930e-2) * r +
1022 2.96560_57182_85048_91230e-1) * r +
1023 1.78482_65399_17291_33580e+0) * r +
1024 5.46378_49111_64114_36990e+0) * r +
1025 6.65790_46435_01103_77720e+0)
1026 den = (((((((2.04426_31033_89939_78564e-15 * r +
1027 1.42151_17583_16445_88870e-7) * r +
1028 1.84631_83175_10054_68180e-5) * r +
1029 7.86869_13114_56132_59100e-4) * r +
1030 1.48753_61290_85061_48525e-2) * r +
1031 1.36929_88092_27358_05310e-1) * r +
1032 5.99832_20655_58879_37690e-1) * r +
1033 1.0)
1034 x = num / den
1035 if q < 0.0:
1036 x = -x
1037 return mu + (x * sigma)
1038
1039
Raymond Hettinger0400a7f2020-05-02 19:30:24 -07001040# If available, use C implementation
1041try:
1042 from _statistics import _normal_dist_inv_cdf
1043except ImportError:
1044 pass
1045
1046
Raymond Hettinger11c79532019-02-23 14:44:07 -08001047class NormalDist:
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001048 "Normal distribution of a random variable"
Raymond Hettinger11c79532019-02-23 14:44:07 -08001049 # https://en.wikipedia.org/wiki/Normal_distribution
1050 # https://en.wikipedia.org/wiki/Variance#Properties
1051
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001052 __slots__ = {
1053 '_mu': 'Arithmetic mean of a normal distribution',
1054 '_sigma': 'Standard deviation of a normal distribution',
1055 }
Raymond Hettinger11c79532019-02-23 14:44:07 -08001056
1057 def __init__(self, mu=0.0, sigma=1.0):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001058 "NormalDist where mu is the mean and sigma is the standard deviation."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001059 if sigma < 0.0:
1060 raise StatisticsError('sigma must be non-negative')
Raymond Hettingere4810b22019-09-05 00:18:47 -07001061 self._mu = float(mu)
1062 self._sigma = float(sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001063
1064 @classmethod
1065 def from_samples(cls, data):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001066 "Make a normal distribution instance from sample data."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001067 if not isinstance(data, (list, tuple)):
1068 data = list(data)
1069 xbar = fmean(data)
1070 return cls(xbar, stdev(data, xbar))
1071
Raymond Hettingerfb8c7d52019-04-23 01:46:18 -07001072 def samples(self, n, *, seed=None):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001073 "Generate *n* samples for a given mean and standard deviation."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001074 gauss = random.gauss if seed is None else random.Random(seed).gauss
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001075 mu, sigma = self._mu, self._sigma
Raymond Hettinger11c79532019-02-23 14:44:07 -08001076 return [gauss(mu, sigma) for i in range(n)]
1077
1078 def pdf(self, x):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001079 "Probability density function. P(x <= X < x+dx) / dx"
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001080 variance = self._sigma ** 2.0
Raymond Hettinger11c79532019-02-23 14:44:07 -08001081 if not variance:
1082 raise StatisticsError('pdf() not defined when sigma is zero')
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001083 return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001084
1085 def cdf(self, x):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001086 "Cumulative distribution function. P(X <= x)"
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001087 if not self._sigma:
Raymond Hettinger11c79532019-02-23 14:44:07 -08001088 raise StatisticsError('cdf() not defined when sigma is zero')
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001089 return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))
Raymond Hettinger11c79532019-02-23 14:44:07 -08001090
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001091 def inv_cdf(self, p):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001092 """Inverse cumulative distribution function. x : P(X <= x) = p
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001093
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001094 Finds the value of the random variable such that the probability of
1095 the variable being less than or equal to that value equals the given
1096 probability.
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001097
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001098 This function is also called the percent point function or quantile
1099 function.
1100 """
1101 if p <= 0.0 or p >= 1.0:
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001102 raise StatisticsError('p must be in the range 0.0 < p < 1.0')
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001103 if self._sigma <= 0.0:
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001104 raise StatisticsError('cdf() not defined when sigma at or below zero')
Dong-hee Na0a18ee42019-08-24 07:20:30 +09001105 return _normal_dist_inv_cdf(p, self._mu, self._sigma)
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001106
Raymond Hettinger4db25d52019-09-08 16:57:58 -07001107 def quantiles(self, n=4):
1108 """Divide into *n* continuous intervals with equal probability.
1109
1110 Returns a list of (n - 1) cut points separating the intervals.
1111
1112 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
1113 Set *n* to 100 for percentiles which gives the 99 cuts points that
1114 separate the normal distribution in to 100 equal sized groups.
1115 """
1116 return [self.inv_cdf(i / n) for i in range(1, n)]
1117
Raymond Hettinger318d5372019-03-06 22:59:40 -08001118 def overlap(self, other):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001119 """Compute the overlapping coefficient (OVL) between two normal distributions.
Raymond Hettinger318d5372019-03-06 22:59:40 -08001120
1121 Measures the agreement between two normal probability distributions.
1122 Returns a value between 0.0 and 1.0 giving the overlapping area in
1123 the two underlying probability density functions.
1124
1125 >>> N1 = NormalDist(2.4, 1.6)
1126 >>> N2 = NormalDist(3.2, 2.0)
1127 >>> N1.overlap(N2)
1128 0.8035050657330205
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001129 """
Raymond Hettinger318d5372019-03-06 22:59:40 -08001130 # See: "The overlapping coefficient as a measure of agreement between
1131 # probability distributions and point estimation of the overlap of two
1132 # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
1133 # http://dx.doi.org/10.1080/03610928908830127
1134 if not isinstance(other, NormalDist):
1135 raise TypeError('Expected another NormalDist instance')
1136 X, Y = self, other
Raymond Hettinger5aad0272020-06-13 19:17:28 -07001137 if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity
Raymond Hettinger318d5372019-03-06 22:59:40 -08001138 X, Y = Y, X
1139 X_var, Y_var = X.variance, Y.variance
1140 if not X_var or not Y_var:
1141 raise StatisticsError('overlap() not defined when sigma is zero')
1142 dv = Y_var - X_var
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001143 dm = fabs(Y._mu - X._mu)
Raymond Hettinger318d5372019-03-06 22:59:40 -08001144 if not dv:
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001145 return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
1146 a = X._mu * Y_var - Y._mu * X_var
1147 b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
Raymond Hettinger318d5372019-03-06 22:59:40 -08001148 x1 = (a + b) / dv
1149 x2 = (a - b) / dv
1150 return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
1151
Raymond Hettinger70f027d2020-04-16 10:25:14 -07001152 def zscore(self, x):
1153 """Compute the Standard Score. (x - mean) / stdev
1154
1155 Describes *x* in terms of the number of standard deviations
1156 above or below the mean of the normal distribution.
1157 """
1158 # https://www.statisticshowto.com/probability-and-statistics/z-score/
1159 if not self._sigma:
1160 raise StatisticsError('zscore() not defined when sigma is zero')
1161 return (x - self._mu) / self._sigma
1162
Raymond Hettinger11c79532019-02-23 14:44:07 -08001163 @property
Raymond Hettinger9e456bc2019-02-24 11:44:55 -08001164 def mean(self):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001165 "Arithmetic mean of the normal distribution."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001166 return self._mu
Raymond Hettinger9e456bc2019-02-24 11:44:55 -08001167
1168 @property
Raymond Hettinger4db25d52019-09-08 16:57:58 -07001169 def median(self):
1170 "Return the median of the normal distribution"
1171 return self._mu
1172
1173 @property
1174 def mode(self):
1175 """Return the mode of the normal distribution
1176
1177 The mode is the value x where which the probability density
1178 function (pdf) takes its maximum value.
1179 """
1180 return self._mu
1181
1182 @property
Raymond Hettinger9e456bc2019-02-24 11:44:55 -08001183 def stdev(self):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001184 "Standard deviation of the normal distribution."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001185 return self._sigma
Raymond Hettinger9e456bc2019-02-24 11:44:55 -08001186
1187 @property
Raymond Hettinger11c79532019-02-23 14:44:07 -08001188 def variance(self):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001189 "Square of the standard deviation."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001190 return self._sigma ** 2.0
Raymond Hettinger11c79532019-02-23 14:44:07 -08001191
1192 def __add__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001193 """Add a constant or another NormalDist instance.
Raymond Hettinger5f1e8b42019-03-18 22:24:15 -07001194
1195 If *other* is a constant, translate mu by the constant,
1196 leaving sigma unchanged.
1197
1198 If *other* is a NormalDist, add both the means and the variances.
1199 Mathematically, this works only if the two distributions are
1200 independent or if they are jointly normally distributed.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001201 """
Raymond Hettinger11c79532019-02-23 14:44:07 -08001202 if isinstance(x2, NormalDist):
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001203 return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
1204 return NormalDist(x1._mu + x2, x1._sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001205
1206 def __sub__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001207 """Subtract a constant or another NormalDist instance.
Raymond Hettinger5f1e8b42019-03-18 22:24:15 -07001208
1209 If *other* is a constant, translate by the constant mu,
1210 leaving sigma unchanged.
1211
1212 If *other* is a NormalDist, subtract the means and add the variances.
1213 Mathematically, this works only if the two distributions are
1214 independent or if they are jointly normally distributed.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001215 """
Raymond Hettinger11c79532019-02-23 14:44:07 -08001216 if isinstance(x2, NormalDist):
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001217 return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
1218 return NormalDist(x1._mu - x2, x1._sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001219
1220 def __mul__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001221 """Multiply both mu and sigma by a constant.
Raymond Hettinger5f1e8b42019-03-18 22:24:15 -07001222
1223 Used for rescaling, perhaps to change measurement units.
1224 Sigma is scaled with the absolute value of the constant.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001225 """
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001226 return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
Raymond Hettinger11c79532019-02-23 14:44:07 -08001227
1228 def __truediv__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001229 """Divide both mu and sigma by a constant.
Raymond Hettinger5f1e8b42019-03-18 22:24:15 -07001230
1231 Used for rescaling, perhaps to change measurement units.
1232 Sigma is scaled with the absolute value of the constant.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001233 """
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001234 return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
Raymond Hettinger11c79532019-02-23 14:44:07 -08001235
1236 def __pos__(x1):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001237 "Return a copy of the instance."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001238 return NormalDist(x1._mu, x1._sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001239
1240 def __neg__(x1):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001241 "Negates mu while keeping sigma the same."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001242 return NormalDist(-x1._mu, x1._sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001243
1244 __radd__ = __add__
1245
1246 def __rsub__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001247 "Subtract a NormalDist from a constant or another NormalDist."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001248 return -(x1 - x2)
1249
1250 __rmul__ = __mul__
1251
1252 def __eq__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001253 "Two NormalDist objects are equal if their mu and sigma are both equal."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001254 if not isinstance(x2, NormalDist):
1255 return NotImplemented
Raymond Hettinger5eabec02019-10-18 14:20:35 -07001256 return x1._mu == x2._mu and x1._sigma == x2._sigma
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001257
1258 def __hash__(self):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001259 "NormalDist objects hash equal if their mu and sigma are both equal."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001260 return hash((self._mu, self._sigma))
Raymond Hettinger11c79532019-02-23 14:44:07 -08001261
1262 def __repr__(self):
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001263 return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'