blob: cfcc456fd786efafb0f732020956ff052d4b4aa1 [file] [log] [blame]
Larry Hastingsf5e987b2013-10-19 11:50:09 -07001"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
Raymond Hettinger9013ccf2019-04-23 00:06:35 -070010================== ==================================================
Larry Hastingsf5e987b2013-10-19 11:50:09 -070011Function Description
Raymond Hettinger9013ccf2019-04-23 00:06:35 -070012================== ==================================================
Larry Hastingsf5e987b2013-10-19 11:50:09 -070013mean Arithmetic mean (average) of data.
Raymond Hettinger72800482019-04-23 01:35:16 -070014fmean Fast, floating point arithmetic mean.
Raymond Hettinger6463ba32019-04-07 09:20:03 -070015geometric_mean Geometric mean of data.
Steven D'Apranoa474afd2016-08-09 12:49:01 +100016harmonic_mean Harmonic mean of data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -070017median Median (middle value) of data.
18median_low Low median of data.
19median_high High median of data.
20median_grouped Median, or 50th percentile, of grouped data.
21mode Mode (most common value) of data.
Raymond Hettinger6463ba32019-04-07 09:20:03 -070022multimode List of modes (most common values of data).
Raymond Hettinger9013ccf2019-04-23 00:06:35 -070023quantiles Divide data into intervals with equal probability.
24================== ==================================================
Larry Hastingsf5e987b2013-10-19 11:50:09 -070025
26Calculate the arithmetic mean ("the average") of data:
27
28>>> mean([-1.0, 2.5, 3.25, 5.75])
292.625
30
31
32Calculate the standard median of discrete data:
33
34>>> median([2, 3, 4, 5])
353.5
36
37
38Calculate the median, or 50th percentile, of data grouped into class intervals
39centred on the data values provided. E.g. if your data points are rounded to
40the nearest whole number:
41
42>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
432.8333333333...
44
45This should be interpreted in this way: you have two data points in the class
46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47the class interval 3.5-4.5. The median of these data points is 2.8333...
48
49
50Calculating variability or spread
51---------------------------------
52
53================== =============================================
54Function Description
55================== =============================================
56pvariance Population variance of data.
57variance Sample variance of data.
58pstdev Population standard deviation of data.
59stdev Sample standard deviation of data.
60================== =============================================
61
62Calculate the standard deviation of sample data:
63
64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
654.38961843444...
66
67If you have previously calculated the mean, you can pass it as the optional
68second argument to the four "spread" functions to avoid recalculating it:
69
70>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
71>>> mu = mean(data)
72>>> pvariance(data, mu)
732.5
74
75
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +020076Statistics for relations between two inputs
77-------------------------------------------
78
79================== ====================================================
80Function Description
81================== ====================================================
82covariance Sample covariance for two variables.
83correlation Pearson's correlation coefficient for two variables.
84linear_regression Intercept and slope for simple linear regression.
85================== ====================================================
86
87Calculate covariance, Pearson's correlation, and simple linear regression
88for two inputs:
89
90>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
91>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
92>>> covariance(x, y)
930.75
94>>> correlation(x, y) #doctest: +ELLIPSIS
950.31622776601...
96>>> linear_regression(x, y) #doctest:
Miss Islington (bot)86779872021-05-24 18:11:12 -070097LinearRegression(slope=0.1, intercept=1.5)
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +020098
99
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700100Exceptions
101----------
102
103A single exception is defined: StatisticsError is a subclass of ValueError.
104
105"""
106
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700107__all__ = [
108 'NormalDist',
109 'StatisticsError',
Miss Islington (bot)5442cfa2021-06-04 18:49:29 -0700110 'correlation',
111 'covariance',
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700112 'fmean',
113 'geometric_mean',
114 'harmonic_mean',
Miss Islington (bot)5442cfa2021-06-04 18:49:29 -0700115 'linear_regression',
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700116 'mean',
117 'median',
118 'median_grouped',
119 'median_high',
120 'median_low',
121 'mode',
122 'multimode',
123 'pstdev',
124 'pvariance',
125 'quantiles',
126 'stdev',
127 'variance',
128]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700129
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700130import math
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000131import numbers
Raymond Hettinger11c79532019-02-23 14:44:07 -0800132import random
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700133
134from fractions import Fraction
135from decimal import Decimal
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800136from itertools import groupby, repeat
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000137from bisect import bisect_left, bisect_right
Raymond Hettinger318d5372019-03-06 22:59:40 -0800138from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700139from operator import itemgetter
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200140from collections import Counter, namedtuple
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700141
142# === Exceptions ===
143
144class StatisticsError(ValueError):
145 pass
146
147
148# === Private utilities ===
149
Raymond Hettinger3c308052021-09-08 22:42:29 -0500150def _sum(data):
151 """_sum(data) -> (type, sum, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700152
Steven D'Apranob28c3272015-12-01 19:59:53 +1100153 Return a high-precision sum of the given numeric data as a fraction,
154 together with the type to be converted to and the count of items.
155
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700156 Examples
157 --------
158
Raymond Hettinger3c308052021-09-08 22:42:29 -0500159 >>> _sum([3, 2.25, 4.5, -0.5, 0.25])
160 (<class 'float'>, Fraction(19, 2), 5)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700161
162 Some sources of round-off error will be avoided:
163
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000164 # Built-in sum returns zero.
165 >>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700166 (<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700167
168 Fractions and Decimals are also supported:
169
170 >>> from fractions import Fraction as F
171 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Petersonab078e92016-07-13 21:13:29 -0700172 (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700173
174 >>> from decimal import Decimal as D
175 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
176 >>> _sum(data)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700177 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700178
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000179 Mixed types are currently treated as an error, except that int is
180 allowed.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700181 """
Steven D'Apranob28c3272015-12-01 19:59:53 +1100182 count = 0
Raymond Hettinger3c308052021-09-08 22:42:29 -0500183 partials = {}
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700184 partials_get = partials.get
Raymond Hettinger3c308052021-09-08 22:42:29 -0500185 T = int
Steven D'Apranob28c3272015-12-01 19:59:53 +1100186 for typ, values in groupby(data, type):
187 T = _coerce(T, typ) # or raise TypeError
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700188 for n, d in map(_exact_ratio, values):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100189 count += 1
190 partials[d] = partials_get(d, 0) + n
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700191 if None in partials:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100192 # The sum will be a NAN or INF. We can ignore all the finite
193 # partials, and just look at this special one.
194 total = partials[None]
195 assert not _isfinite(total)
196 else:
197 # Sum all the partial sums using builtin sum.
Raymond Hettinger3c308052021-09-08 22:42:29 -0500198 total = sum(Fraction(n, d) for d, n in partials.items())
Steven D'Apranob28c3272015-12-01 19:59:53 +1100199 return (T, total, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700200
201
Steven D'Apranob28c3272015-12-01 19:59:53 +1100202def _isfinite(x):
203 try:
204 return x.is_finite() # Likely a Decimal.
205 except AttributeError:
206 return math.isfinite(x) # Coerces to float first.
207
208
209def _coerce(T, S):
210 """Coerce types T and S to a common type, or raise TypeError.
211
212 Coercion rules are currently an implementation detail. See the CoerceTest
213 test class in test_statistics for details.
214 """
215 # See http://bugs.python.org/issue24068.
216 assert T is not bool, "initial type T is bool"
217 # If the types are the same, no need to coerce anything. Put this
218 # first, so that the usual case (no coercion needed) happens as soon
219 # as possible.
220 if T is S: return T
221 # Mixed int & other coerce to the other type.
222 if S is int or S is bool: return T
223 if T is int: return S
224 # If one is a (strict) subclass of the other, coerce to the subclass.
225 if issubclass(S, T): return S
226 if issubclass(T, S): return T
227 # Ints coerce to the other type.
228 if issubclass(T, int): return S
229 if issubclass(S, int): return T
230 # Mixed fraction & float coerces to float (or float subclass).
231 if issubclass(T, Fraction) and issubclass(S, float):
232 return S
233 if issubclass(T, float) and issubclass(S, Fraction):
234 return T
235 # Any other combination is disallowed.
236 msg = "don't know how to coerce %s and %s"
237 raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000238
239
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700240def _exact_ratio(x):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100241 """Return Real number x to exact (numerator, denominator) pair.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700242
243 >>> _exact_ratio(0.25)
244 (1, 4)
245
246 x is expected to be an int, Fraction, Decimal or float.
247 """
248 try:
Raymond Hettinger3c308052021-09-08 22:42:29 -0500249 return x.as_integer_ratio()
250 except AttributeError:
251 pass
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700252 except (OverflowError, ValueError):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100253 # float NAN or INF.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000254 assert not _isfinite(x)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700255 return (x, None)
Raymond Hettinger3c308052021-09-08 22:42:29 -0500256 try:
257 # x may be an Integral ABC.
258 return (x.numerator, x.denominator)
259 except AttributeError:
260 msg = f"can't convert type '{type(x).__name__}' to numerator/denominator"
261 raise TypeError(msg)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700262
263
Steven D'Apranob28c3272015-12-01 19:59:53 +1100264def _convert(value, T):
265 """Convert value to given numeric type T."""
266 if type(value) is T:
267 # This covers the cases where T is Fraction, or where value is
268 # a NAN or INF (Decimal or float).
269 return value
270 if issubclass(T, int) and value.denominator != 1:
271 T = float
272 try:
273 # FIXME: what do we do if this overflows?
274 return T(value)
275 except TypeError:
276 if issubclass(T, Decimal):
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700277 return T(value.numerator) / T(value.denominator)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100278 else:
279 raise
280
281
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000282def _find_lteq(a, x):
283 'Locate the leftmost value exactly equal to x'
284 i = bisect_left(a, x)
285 if i != len(a) and a[i] == x:
286 return i
287 raise ValueError
288
289
290def _find_rteq(a, l, x):
291 'Locate the rightmost value exactly equal to x'
292 i = bisect_right(a, x, lo=l)
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700293 if i != (len(a) + 1) and a[i - 1] == x:
294 return i - 1
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000295 raise ValueError
296
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000297
298def _fail_neg(values, errmsg='negative value'):
299 """Iterate over values, failing if any are less than zero."""
300 for x in values:
301 if x < 0:
302 raise StatisticsError(errmsg)
303 yield x
304
305
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700306# === Measures of central tendency (averages) ===
307
308def mean(data):
309 """Return the sample arithmetic mean of data.
310
311 >>> mean([1, 2, 3, 4, 4])
312 2.8
313
314 >>> from fractions import Fraction as F
315 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
316 Fraction(13, 21)
317
318 >>> from decimal import Decimal as D
319 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
320 Decimal('0.5625')
321
322 If ``data`` is empty, StatisticsError will be raised.
323 """
324 if iter(data) is data:
325 data = list(data)
326 n = len(data)
327 if n < 1:
328 raise StatisticsError('mean requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100329 T, total, count = _sum(data)
330 assert count == n
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700331 return _convert(total / n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700332
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700333
Raymond Hettinger47d99872019-02-21 15:06:29 -0800334def fmean(data):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700335 """Convert data to floats and compute the arithmetic mean.
Raymond Hettinger47d99872019-02-21 15:06:29 -0800336
337 This runs faster than the mean() function and it always returns a float.
Raymond Hettinger47d99872019-02-21 15:06:29 -0800338 If the input dataset is empty, it raises a StatisticsError.
339
340 >>> fmean([3.5, 4.0, 5.25])
341 4.25
Raymond Hettinger47d99872019-02-21 15:06:29 -0800342 """
343 try:
344 n = len(data)
345 except TypeError:
346 # Handle iterators that do not define __len__().
347 n = 0
Raymond Hettinger6c01ebc2019-06-05 07:39:38 -0700348 def count(iterable):
Raymond Hettinger47d99872019-02-21 15:06:29 -0800349 nonlocal n
Raymond Hettinger6c01ebc2019-06-05 07:39:38 -0700350 for n, x in enumerate(iterable, start=1):
351 yield x
352 total = fsum(count(data))
Raymond Hettinger47d99872019-02-21 15:06:29 -0800353 else:
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700354 total = fsum(data)
Raymond Hettinger47d99872019-02-21 15:06:29 -0800355 try:
356 return total / n
357 except ZeroDivisionError:
358 raise StatisticsError('fmean requires at least one data point') from None
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700359
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700360
Raymond Hettinger6463ba32019-04-07 09:20:03 -0700361def geometric_mean(data):
362 """Convert data to floats and compute the geometric mean.
363
364 Raises a StatisticsError if the input dataset is empty,
365 if it contains a zero, or if it contains a negative value.
366
367 No special efforts are made to achieve exact results.
368 (However, this may change in the future.)
369
370 >>> round(geometric_mean([54, 24, 36]), 9)
371 36.0
372 """
373 try:
374 return exp(fmean(map(log, data)))
375 except ValueError:
376 raise StatisticsError('geometric mean requires a non-empty dataset '
377 ' containing positive numbers') from None
378
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700379
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800380def harmonic_mean(data, weights=None):
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000381 """Return the harmonic mean of data.
382
Raymond Hettinger30a8b282021-02-07 16:44:42 -0800383 The harmonic mean is the reciprocal of the arithmetic mean of the
384 reciprocals of the data. It can be used for averaging ratios or
385 rates, for example speeds.
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000386
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800387 Suppose a car travels 40 km/hr for 5 km and then speeds-up to
388 60 km/hr for another 5 km. What is the average speed?
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000389
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800390 >>> harmonic_mean([40, 60])
391 48.0
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000392
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800393 Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
394 speeds-up to 60 km/hr for the remaining 30 km of the journey. What
395 is the average speed?
396
397 >>> harmonic_mean([40, 60], weights=[5, 30])
398 56.0
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000399
400 If ``data`` is empty, or any element is less than zero,
401 ``harmonic_mean`` will raise ``StatisticsError``.
402 """
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000403 if iter(data) is data:
404 data = list(data)
405 errmsg = 'harmonic mean does not support negative values'
406 n = len(data)
407 if n < 1:
408 raise StatisticsError('harmonic_mean requires at least one data point')
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800409 elif n == 1 and weights is None:
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000410 x = data[0]
411 if isinstance(x, (numbers.Real, Decimal)):
412 if x < 0:
413 raise StatisticsError(errmsg)
414 return x
415 else:
416 raise TypeError('unsupported type')
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800417 if weights is None:
418 weights = repeat(1, n)
419 sum_weights = n
420 else:
421 if iter(weights) is weights:
422 weights = list(weights)
423 if len(weights) != n:
424 raise StatisticsError('Number of weights does not match data size')
425 _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000426 try:
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800427 data = _fail_neg(data, errmsg)
428 T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000429 except ZeroDivisionError:
430 return 0
Raymond Hettingercc3467a2020-12-23 19:52:09 -0800431 if total <= 0:
432 raise StatisticsError('Weighted sum must be positive')
433 return _convert(sum_weights / total, T)
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000434
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700435# FIXME: investigate ways to calculate medians without sorting? Quickselect?
436def median(data):
437 """Return the median (middle value) of numeric data.
438
439 When the number of data points is odd, return the middle data point.
440 When the number of data points is even, the median is interpolated by
441 taking the average of the two middle values:
442
443 >>> median([1, 3, 5])
444 3
445 >>> median([1, 3, 5, 7])
446 4.0
447
448 """
449 data = sorted(data)
450 n = len(data)
451 if n == 0:
452 raise StatisticsError("no median for empty data")
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700453 if n % 2 == 1:
454 return data[n // 2]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700455 else:
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700456 i = n // 2
457 return (data[i - 1] + data[i]) / 2
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700458
459
460def median_low(data):
461 """Return the low median of numeric data.
462
463 When the number of data points is odd, the middle value is returned.
464 When it is even, the smaller of the two middle values is returned.
465
466 >>> median_low([1, 3, 5])
467 3
468 >>> median_low([1, 3, 5, 7])
469 3
470
471 """
472 data = sorted(data)
473 n = len(data)
474 if n == 0:
475 raise StatisticsError("no median for empty data")
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700476 if n % 2 == 1:
477 return data[n // 2]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700478 else:
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700479 return data[n // 2 - 1]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700480
481
482def median_high(data):
483 """Return the high median of data.
484
485 When the number of data points is odd, the middle value is returned.
486 When it is even, the larger of the two middle values is returned.
487
488 >>> median_high([1, 3, 5])
489 3
490 >>> median_high([1, 3, 5, 7])
491 5
492
493 """
494 data = sorted(data)
495 n = len(data)
496 if n == 0:
497 raise StatisticsError("no median for empty data")
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700498 return data[n // 2]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700499
500
501def median_grouped(data, interval=1):
Zachary Waredf2660e2015-10-27 22:00:41 -0500502 """Return the 50th percentile (median) of grouped continuous data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700503
504 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
505 3.7
506 >>> median_grouped([52, 52, 53, 54])
507 52.5
508
509 This calculates the median as the 50th percentile, and should be
510 used when your data is continuous and grouped. In the above example,
511 the values 1, 2, 3, etc. actually represent the midpoint of classes
512 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
513 class 3.5-4.5, and interpolation is used to estimate it.
514
515 Optional argument ``interval`` represents the class interval, and
516 defaults to 1. Changing the class interval naturally will change the
517 interpolated 50th percentile value:
518
519 >>> median_grouped([1, 3, 3, 5, 7], interval=1)
520 3.25
521 >>> median_grouped([1, 3, 3, 5, 7], interval=2)
522 3.5
523
524 This function does not check whether the data points are at least
525 ``interval`` apart.
526 """
527 data = sorted(data)
528 n = len(data)
529 if n == 0:
530 raise StatisticsError("no median for empty data")
531 elif n == 1:
532 return data[0]
533 # Find the value at the midpoint. Remember this corresponds to the
534 # centre of the class interval.
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700535 x = data[n // 2]
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700536 for obj in (x, interval):
537 if isinstance(obj, (str, bytes)):
538 raise TypeError('expected number but got %r' % obj)
539 try:
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700540 L = x - interval / 2 # The lower limit of the median interval.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700541 except TypeError:
542 # Mixed type. For now we just coerce to float.
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700543 L = float(x) - float(interval) / 2
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000544
545 # Uses bisection search to search for x in data with log(n) time complexity
Martin Panterf1579822016-05-26 06:03:33 +0000546 # Find the position of leftmost occurrence of x in data
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000547 l1 = _find_lteq(data, x)
Martin Panterf1579822016-05-26 06:03:33 +0000548 # Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000549 # Assuming always l1 <= l2
550 l2 = _find_rteq(data, l1, x)
551 cf = l1
552 f = l2 - l1 + 1
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700553 return L + interval * (n / 2 - cf) / f
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700554
555
556def mode(data):
557 """Return the most common data point from discrete or nominal data.
558
559 ``mode`` assumes discrete data, and returns a single value. This is the
560 standard treatment of the mode as commonly taught in schools:
561
Raymond Hettingere4810b22019-09-05 00:18:47 -0700562 >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
563 3
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700564
565 This also works with nominal (non-numeric) data:
566
Raymond Hettingere4810b22019-09-05 00:18:47 -0700567 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
568 'red'
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700569
Raymond Hettingere4810b22019-09-05 00:18:47 -0700570 If there are multiple modes with same frequency, return the first one
571 encountered:
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700572
573 >>> mode(['red', 'red', 'green', 'blue', 'blue'])
574 'red'
575
576 If *data* is empty, ``mode``, raises StatisticsError.
577
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700578 """
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700579 pairs = Counter(iter(data)).most_common(1)
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700580 try:
Raymond Hettinger7ce4bfa2019-09-20 21:46:52 -0700581 return pairs[0][0]
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700582 except IndexError:
583 raise StatisticsError('no mode for empty data') from None
584
585
586def multimode(data):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700587 """Return a list of the most frequently occurring values.
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700588
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700589 Will return more than one result if there are multiple modes
590 or an empty list if *data* is empty.
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700591
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700592 >>> multimode('aabbbbbbbbcc')
593 ['b']
594 >>> multimode('aabbbbccddddeeffffgg')
595 ['b', 'd', 'f']
596 >>> multimode('')
597 []
Raymond Hettingerfc06a192019-03-12 00:43:27 -0700598 """
599 counts = Counter(iter(data)).most_common()
600 maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
601 return list(map(itemgetter(0), mode_items))
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700602
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700603
Raymond Hettingercba9f842019-06-02 21:07:43 -0700604# Notes on methods for computing quantiles
605# ----------------------------------------
606#
607# There is no one perfect way to compute quantiles. Here we offer
608# two methods that serve common needs. Most other packages
609# surveyed offered at least one or both of these two, making them
610# "standard" in the sense of "widely-adopted and reproducible".
611# They are also easy to explain, easy to compute manually, and have
612# straight-forward interpretations that aren't surprising.
613
614# The default method is known as "R6", "PERCENTILE.EXC", or "expected
615# value of rank order statistics". The alternative method is known as
616# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
617
618# For sample data where there is a positive probability for values
619# beyond the range of the data, the R6 exclusive method is a
620# reasonable choice. Consider a random sample of nine values from a
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700621# population with a uniform distribution from 0.0 to 1.0. The
Raymond Hettingercba9f842019-06-02 21:07:43 -0700622# distribution of the third ranked sample point is described by
623# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
624# mean=0.300. Only the latter (which corresponds with R6) gives the
625# desired cut point with 30% of the population falling below that
626# value, making it comparable to a result from an inv_cdf() function.
Raymond Hettinger7ce4bfa2019-09-20 21:46:52 -0700627# The R6 exclusive method is also idempotent.
Raymond Hettingercba9f842019-06-02 21:07:43 -0700628
629# For describing population data where the end points are known to
630# be included in the data, the R7 inclusive method is a reasonable
631# choice. Instead of the mean, it uses the mode of the beta
632# distribution for the interior points. Per Hyndman & Fan, "One nice
633# property is that the vertices of Q7(p) divide the range into n - 1
634# intervals, and exactly 100p% of the intervals lie to the left of
635# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
636
Raymond Hettingereed5e9a2019-07-19 01:57:22 -0700637# If needed, other methods could be added. However, for now, the
638# position is that fewer options make for easier choices and that
639# external packages can be used for anything more advanced.
Raymond Hettingercba9f842019-06-02 21:07:43 -0700640
Raymond Hettinger272d0d02019-09-17 20:45:05 -0700641def quantiles(data, *, n=4, method='exclusive'):
Raymond Hettingere4810b22019-09-05 00:18:47 -0700642 """Divide *data* into *n* continuous intervals with equal probability.
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700643
644 Returns a list of (n - 1) cut points separating the intervals.
645
646 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
647 Set *n* to 100 for percentiles which gives the 99 cuts points that
Raymond Hettingere4810b22019-09-05 00:18:47 -0700648 separate *data* in to 100 equal sized groups.
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700649
Raymond Hettinger4db25d52019-09-08 16:57:58 -0700650 The *data* can be any iterable containing sample.
651 The cut points are linearly interpolated between data points.
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700652
Raymond Hettingere4810b22019-09-05 00:18:47 -0700653 If *method* is set to *inclusive*, *data* is treated as population
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700654 data. The minimum value is treated as the 0th percentile and the
655 maximum value is treated as the 100th percentile.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700656 """
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700657 if n < 1:
658 raise StatisticsError('n must be at least 1')
Raymond Hettingere4810b22019-09-05 00:18:47 -0700659 data = sorted(data)
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700660 ld = len(data)
661 if ld < 2:
662 raise StatisticsError('must have at least two data points')
663 if method == 'inclusive':
664 m = ld - 1
665 result = []
666 for i in range(1, n):
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700667 j, delta = divmod(i * m, n)
668 interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700669 result.append(interpolated)
670 return result
671 if method == 'exclusive':
672 m = ld + 1
673 result = []
674 for i in range(1, n):
675 j = i * m // n # rescale i to m/n
676 j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1
677 delta = i*m - j*n # exact integer math
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700678 interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
Raymond Hettinger9013ccf2019-04-23 00:06:35 -0700679 result.append(interpolated)
680 return result
681 raise ValueError(f'Unknown method: {method!r}')
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700682
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700683
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700684# === Measures of spread ===
685
686# See http://mathworld.wolfram.com/Variance.html
687# http://mathworld.wolfram.com/SampleVariance.html
688# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
689#
690# Under no circumstances use the so-called "computational formula for
691# variance", as that is only suitable for hand calculations with a small
692# amount of low-precision data. It has terrible numeric properties.
693#
694# See a comparison of three computational methods here:
695# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
696
697def _ss(data, c=None):
698 """Return sum of square deviations of sequence data.
699
700 If ``c`` is None, the mean is calculated in one pass, and the deviations
701 from the mean are calculated in a second pass. Otherwise, deviations are
702 calculated from ``c`` as given. Use the second case with care, as it can
703 lead to garbage results.
704 """
Raymond Hettingerd71ab4f2020-06-13 15:55:52 -0700705 if c is not None:
706 T, total, count = _sum((x-c)**2 for x in data)
707 return (T, total)
Raymond Hettinger3c308052021-09-08 22:42:29 -0500708 T, total, count = _sum(data)
709 mean_n, mean_d = (total / count).as_integer_ratio()
710 partials = Counter()
711 for n, d in map(_exact_ratio, data):
712 diff_n = n * mean_d - d * mean_n
713 diff_d = d * mean_d
714 partials[diff_d * diff_d] += diff_n * diff_n
715 if None in partials:
716 # The sum will be a NAN or INF. We can ignore all the finite
717 # partials, and just look at this special one.
718 total = partials[None]
719 assert not _isfinite(total)
720 else:
721 total = sum(Fraction(n, d) for d, n in partials.items())
Steven D'Apranob28c3272015-12-01 19:59:53 +1100722 return (T, total)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700723
724
725def variance(data, xbar=None):
726 """Return the sample variance of data.
727
728 data should be an iterable of Real-valued numbers, with at least two
729 values. The optional argument xbar, if given, should be the mean of
730 the data. If it is missing or None, the mean is automatically calculated.
731
732 Use this function when your data is a sample from a population. To
733 calculate the variance from the entire population, see ``pvariance``.
734
735 Examples:
736
737 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
738 >>> variance(data)
739 1.3720238095238095
740
741 If you have already calculated the mean of your data, you can pass it as
742 the optional second argument ``xbar`` to avoid recalculating it:
743
744 >>> m = mean(data)
745 >>> variance(data, m)
746 1.3720238095238095
747
748 This function does not check that ``xbar`` is actually the mean of
749 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
750 impossible results.
751
752 Decimals and Fractions are supported:
753
754 >>> from decimal import Decimal as D
755 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
756 Decimal('31.01875')
757
758 >>> from fractions import Fraction as F
759 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
760 Fraction(67, 108)
761
762 """
763 if iter(data) is data:
764 data = list(data)
765 n = len(data)
766 if n < 2:
767 raise StatisticsError('variance requires at least two data points')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100768 T, ss = _ss(data, xbar)
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700769 return _convert(ss / (n - 1), T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700770
771
772def pvariance(data, mu=None):
773 """Return the population variance of ``data``.
774
Raymond Hettinger733b9a32019-11-11 23:35:06 -0800775 data should be a sequence or iterable of Real-valued numbers, with at least one
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700776 value. The optional argument mu, if given, should be the mean of
777 the data. If it is missing or None, the mean is automatically calculated.
778
779 Use this function to calculate the variance from the entire population.
780 To estimate the variance from a sample, the ``variance`` function is
781 usually a better choice.
782
783 Examples:
784
785 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
786 >>> pvariance(data)
787 1.25
788
789 If you have already calculated the mean of the data, you can pass it as
790 the optional second argument to avoid recalculating it:
791
792 >>> mu = mean(data)
793 >>> pvariance(data, mu)
794 1.25
795
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700796 Decimals and Fractions are supported:
797
798 >>> from decimal import Decimal as D
799 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
800 Decimal('24.815')
801
802 >>> from fractions import Fraction as F
803 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
804 Fraction(13, 72)
805
806 """
807 if iter(data) is data:
808 data = list(data)
809 n = len(data)
810 if n < 1:
811 raise StatisticsError('pvariance requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100812 T, ss = _ss(data, mu)
Raymond Hettinger5aad0272020-06-13 19:17:28 -0700813 return _convert(ss / n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700814
815
816def stdev(data, xbar=None):
817 """Return the square root of the sample variance.
818
819 See ``variance`` for arguments and other details.
820
821 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
822 1.0810874155219827
823
824 """
Raymond Hettinger3c308052021-09-08 22:42:29 -0500825 # Fixme: Despite the exact sum of squared deviations, some inaccuracy
826 # remain because there are two rounding steps. The first occurs in
827 # the _convert() step for variance(), the second occurs in math.sqrt().
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700828 var = variance(data, xbar)
829 try:
830 return var.sqrt()
831 except AttributeError:
832 return math.sqrt(var)
833
834
835def pstdev(data, mu=None):
836 """Return the square root of the population variance.
837
838 See ``pvariance`` for arguments and other details.
839
840 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
841 0.986893273527251
842
843 """
Raymond Hettinger3c308052021-09-08 22:42:29 -0500844 # Fixme: Despite the exact sum of squared deviations, some inaccuracy
845 # remain because there are two rounding steps. The first occurs in
846 # the _convert() step for pvariance(), the second occurs in math.sqrt().
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700847 var = pvariance(data, mu)
848 try:
849 return var.sqrt()
850 except AttributeError:
851 return math.sqrt(var)
Raymond Hettinger11c79532019-02-23 14:44:07 -0800852
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -0700853
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200854# === Statistics for relations between two inputs ===
855
856# See https://en.wikipedia.org/wiki/Covariance
857# https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
858# https://en.wikipedia.org/wiki/Simple_linear_regression
859
860
861def covariance(x, y, /):
862 """Covariance
863
864 Return the sample covariance of two inputs *x* and *y*. Covariance
865 is a measure of the joint variability of two inputs.
866
867 >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
868 >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
869 >>> covariance(x, y)
870 0.75
871 >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
872 >>> covariance(x, z)
873 -7.5
874 >>> covariance(z, x)
875 -7.5
876
877 """
878 n = len(x)
879 if len(y) != n:
880 raise StatisticsError('covariance requires that both inputs have same number of data points')
881 if n < 2:
882 raise StatisticsError('covariance requires at least two data points')
Miss Islington (bot)5442cfa2021-06-04 18:49:29 -0700883 xbar = fsum(x) / n
884 ybar = fsum(y) / n
885 sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
886 return sxy / (n - 1)
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200887
888
889def correlation(x, y, /):
890 """Pearson's correlation coefficient
891
892 Return the Pearson's correlation coefficient for two inputs. Pearson's
893 correlation coefficient *r* takes values between -1 and +1. It measures the
894 strength and direction of the linear relationship, where +1 means very
895 strong, positive linear relationship, -1 very strong, negative linear
896 relationship, and 0 no linear relationship.
897
898 >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
899 >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
900 >>> correlation(x, x)
901 1.0
902 >>> correlation(x, y)
903 -1.0
904
905 """
906 n = len(x)
907 if len(y) != n:
908 raise StatisticsError('correlation requires that both inputs have same number of data points')
909 if n < 2:
910 raise StatisticsError('correlation requires at least two data points')
Miss Islington (bot)5442cfa2021-06-04 18:49:29 -0700911 xbar = fsum(x) / n
912 ybar = fsum(y) / n
913 sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
Miss Islington (bot)4642caf2021-06-04 19:38:30 -0700914 sxx = fsum((xi - xbar) ** 2.0 for xi in x)
915 syy = fsum((yi - ybar) ** 2.0 for yi in y)
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200916 try:
Miss Islington (bot)4642caf2021-06-04 19:38:30 -0700917 return sxy / sqrt(sxx * syy)
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200918 except ZeroDivisionError:
919 raise StatisticsError('at least one of the inputs is constant')
920
921
Miss Islington (bot)86779872021-05-24 18:11:12 -0700922LinearRegression = namedtuple('LinearRegression', ('slope', 'intercept'))
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200923
924
Miss Islington (bot)86779872021-05-24 18:11:12 -0700925def linear_regression(x, y, /):
Miss Islington (bot)a6825192021-05-24 23:23:10 -0700926 """Slope and intercept for simple linear regression.
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200927
Miss Islington (bot)a6825192021-05-24 23:23:10 -0700928 Return the slope and intercept of simple linear regression
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200929 parameters estimated using ordinary least squares. Simple linear
Miss Islington (bot)a6825192021-05-24 23:23:10 -0700930 regression describes relationship between an independent variable
931 *x* and a dependent variable *y* in terms of linear function:
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200932
Miss Islington (bot)a6825192021-05-24 23:23:10 -0700933 y = slope * x + intercept + noise
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200934
Miss Islington (bot)a6825192021-05-24 23:23:10 -0700935 where *slope* and *intercept* are the regression parameters that are
Miss Islington (bot)e6755ba2021-05-16 19:47:57 -0700936 estimated, and noise represents the variability of the data that was
937 not explained by the linear regression (it is equal to the
Miss Islington (bot)a6825192021-05-24 23:23:10 -0700938 difference between predicted and actual values of the dependent
Miss Islington (bot)e6755ba2021-05-16 19:47:57 -0700939 variable).
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200940
941 The parameters are returned as a named tuple.
942
Miss Islington (bot)86779872021-05-24 18:11:12 -0700943 >>> x = [1, 2, 3, 4, 5]
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200944 >>> noise = NormalDist().samples(5, seed=42)
Miss Islington (bot)a6825192021-05-24 23:23:10 -0700945 >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
Miss Islington (bot)86779872021-05-24 18:11:12 -0700946 >>> linear_regression(x, y) #doctest: +ELLIPSIS
947 LinearRegression(slope=3.09078914170..., intercept=1.75684970486...)
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200948
949 """
Miss Islington (bot)86779872021-05-24 18:11:12 -0700950 n = len(x)
951 if len(y) != n:
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200952 raise StatisticsError('linear regression requires that both inputs have same number of data points')
953 if n < 2:
954 raise StatisticsError('linear regression requires at least two data points')
Miss Islington (bot)8e3cb612021-05-06 08:26:55 -0700955 xbar = fsum(x) / n
956 ybar = fsum(y) / n
957 sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
Miss Islington (bot)4642caf2021-06-04 19:38:30 -0700958 sxx = fsum((xi - xbar) ** 2.0 for xi in x)
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200959 try:
Miss Islington (bot)4642caf2021-06-04 19:38:30 -0700960 slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x)
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200961 except ZeroDivisionError:
Miss Islington (bot)86779872021-05-24 18:11:12 -0700962 raise StatisticsError('x is constant')
Miss Islington (bot)8e3cb612021-05-06 08:26:55 -0700963 intercept = ybar - slope * xbar
Miss Islington (bot)86779872021-05-24 18:11:12 -0700964 return LinearRegression(slope=slope, intercept=intercept)
Tymoteusz Wołodźko09aa6f92021-04-25 13:45:09 +0200965
966
Raymond Hettinger11c79532019-02-23 14:44:07 -0800967## Normal Distribution #####################################################
968
Dong-hee Na0a18ee42019-08-24 07:20:30 +0900969
970def _normal_dist_inv_cdf(p, mu, sigma):
971 # There is no closed-form solution to the inverse CDF for the normal
972 # distribution, so we use a rational approximation instead:
973 # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
974 # Normal Distribution". Applied Statistics. Blackwell Publishing. 37
975 # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
976 q = p - 0.5
977 if fabs(q) <= 0.425:
978 r = 0.180625 - q * q
979 # Hash sum: 55.88319_28806_14901_4439
980 num = (((((((2.50908_09287_30122_6727e+3 * r +
981 3.34305_75583_58812_8105e+4) * r +
982 6.72657_70927_00870_0853e+4) * r +
983 4.59219_53931_54987_1457e+4) * r +
984 1.37316_93765_50946_1125e+4) * r +
985 1.97159_09503_06551_4427e+3) * r +
986 1.33141_66789_17843_7745e+2) * r +
987 3.38713_28727_96366_6080e+0) * q
988 den = (((((((5.22649_52788_52854_5610e+3 * r +
989 2.87290_85735_72194_2674e+4) * r +
990 3.93078_95800_09271_0610e+4) * r +
991 2.12137_94301_58659_5867e+4) * r +
992 5.39419_60214_24751_1077e+3) * r +
993 6.87187_00749_20579_0830e+2) * r +
994 4.23133_30701_60091_1252e+1) * r +
995 1.0)
996 x = num / den
997 return mu + (x * sigma)
998 r = p if q <= 0.0 else 1.0 - p
999 r = sqrt(-log(r))
1000 if r <= 5.0:
1001 r = r - 1.6
1002 # Hash sum: 49.33206_50330_16102_89036
1003 num = (((((((7.74545_01427_83414_07640e-4 * r +
1004 2.27238_44989_26918_45833e-2) * r +
1005 2.41780_72517_74506_11770e-1) * r +
1006 1.27045_82524_52368_38258e+0) * r +
1007 3.64784_83247_63204_60504e+0) * r +
1008 5.76949_72214_60691_40550e+0) * r +
1009 4.63033_78461_56545_29590e+0) * r +
1010 1.42343_71107_49683_57734e+0)
1011 den = (((((((1.05075_00716_44416_84324e-9 * r +
1012 5.47593_80849_95344_94600e-4) * r +
1013 1.51986_66563_61645_71966e-2) * r +
1014 1.48103_97642_74800_74590e-1) * r +
1015 6.89767_33498_51000_04550e-1) * r +
1016 1.67638_48301_83803_84940e+0) * r +
1017 2.05319_16266_37758_82187e+0) * r +
1018 1.0)
1019 else:
1020 r = r - 5.0
1021 # Hash sum: 47.52583_31754_92896_71629
1022 num = (((((((2.01033_43992_92288_13265e-7 * r +
1023 2.71155_55687_43487_57815e-5) * r +
1024 1.24266_09473_88078_43860e-3) * r +
1025 2.65321_89526_57612_30930e-2) * r +
1026 2.96560_57182_85048_91230e-1) * r +
1027 1.78482_65399_17291_33580e+0) * r +
1028 5.46378_49111_64114_36990e+0) * r +
1029 6.65790_46435_01103_77720e+0)
1030 den = (((((((2.04426_31033_89939_78564e-15 * r +
1031 1.42151_17583_16445_88870e-7) * r +
1032 1.84631_83175_10054_68180e-5) * r +
1033 7.86869_13114_56132_59100e-4) * r +
1034 1.48753_61290_85061_48525e-2) * r +
1035 1.36929_88092_27358_05310e-1) * r +
1036 5.99832_20655_58879_37690e-1) * r +
1037 1.0)
1038 x = num / den
1039 if q < 0.0:
1040 x = -x
1041 return mu + (x * sigma)
1042
1043
Raymond Hettinger0400a7f2020-05-02 19:30:24 -07001044# If available, use C implementation
1045try:
1046 from _statistics import _normal_dist_inv_cdf
1047except ImportError:
1048 pass
1049
1050
Raymond Hettinger11c79532019-02-23 14:44:07 -08001051class NormalDist:
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001052 "Normal distribution of a random variable"
Raymond Hettinger11c79532019-02-23 14:44:07 -08001053 # https://en.wikipedia.org/wiki/Normal_distribution
1054 # https://en.wikipedia.org/wiki/Variance#Properties
1055
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001056 __slots__ = {
1057 '_mu': 'Arithmetic mean of a normal distribution',
1058 '_sigma': 'Standard deviation of a normal distribution',
1059 }
Raymond Hettinger11c79532019-02-23 14:44:07 -08001060
1061 def __init__(self, mu=0.0, sigma=1.0):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001062 "NormalDist where mu is the mean and sigma is the standard deviation."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001063 if sigma < 0.0:
1064 raise StatisticsError('sigma must be non-negative')
Raymond Hettingere4810b22019-09-05 00:18:47 -07001065 self._mu = float(mu)
1066 self._sigma = float(sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001067
1068 @classmethod
1069 def from_samples(cls, data):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001070 "Make a normal distribution instance from sample data."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001071 if not isinstance(data, (list, tuple)):
1072 data = list(data)
1073 xbar = fmean(data)
1074 return cls(xbar, stdev(data, xbar))
1075
Raymond Hettingerfb8c7d52019-04-23 01:46:18 -07001076 def samples(self, n, *, seed=None):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001077 "Generate *n* samples for a given mean and standard deviation."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001078 gauss = random.gauss if seed is None else random.Random(seed).gauss
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001079 mu, sigma = self._mu, self._sigma
Raymond Hettinger11c79532019-02-23 14:44:07 -08001080 return [gauss(mu, sigma) for i in range(n)]
1081
1082 def pdf(self, x):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001083 "Probability density function. P(x <= X < x+dx) / dx"
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001084 variance = self._sigma ** 2.0
Raymond Hettinger11c79532019-02-23 14:44:07 -08001085 if not variance:
1086 raise StatisticsError('pdf() not defined when sigma is zero')
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001087 return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001088
1089 def cdf(self, x):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001090 "Cumulative distribution function. P(X <= x)"
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001091 if not self._sigma:
Raymond Hettinger11c79532019-02-23 14:44:07 -08001092 raise StatisticsError('cdf() not defined when sigma is zero')
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001093 return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))
Raymond Hettinger11c79532019-02-23 14:44:07 -08001094
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001095 def inv_cdf(self, p):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001096 """Inverse cumulative distribution function. x : P(X <= x) = p
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001097
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001098 Finds the value of the random variable such that the probability of
1099 the variable being less than or equal to that value equals the given
1100 probability.
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001101
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001102 This function is also called the percent point function or quantile
1103 function.
1104 """
1105 if p <= 0.0 or p >= 1.0:
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001106 raise StatisticsError('p must be in the range 0.0 < p < 1.0')
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001107 if self._sigma <= 0.0:
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001108 raise StatisticsError('cdf() not defined when sigma at or below zero')
Dong-hee Na0a18ee42019-08-24 07:20:30 +09001109 return _normal_dist_inv_cdf(p, self._mu, self._sigma)
Raymond Hettinger714c60d2019-03-18 20:17:14 -07001110
Raymond Hettinger4db25d52019-09-08 16:57:58 -07001111 def quantiles(self, n=4):
1112 """Divide into *n* continuous intervals with equal probability.
1113
1114 Returns a list of (n - 1) cut points separating the intervals.
1115
1116 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
1117 Set *n* to 100 for percentiles which gives the 99 cuts points that
1118 separate the normal distribution in to 100 equal sized groups.
1119 """
1120 return [self.inv_cdf(i / n) for i in range(1, n)]
1121
Raymond Hettinger318d5372019-03-06 22:59:40 -08001122 def overlap(self, other):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001123 """Compute the overlapping coefficient (OVL) between two normal distributions.
Raymond Hettinger318d5372019-03-06 22:59:40 -08001124
1125 Measures the agreement between two normal probability distributions.
1126 Returns a value between 0.0 and 1.0 giving the overlapping area in
1127 the two underlying probability density functions.
1128
1129 >>> N1 = NormalDist(2.4, 1.6)
1130 >>> N2 = NormalDist(3.2, 2.0)
1131 >>> N1.overlap(N2)
1132 0.8035050657330205
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001133 """
Raymond Hettinger318d5372019-03-06 22:59:40 -08001134 # See: "The overlapping coefficient as a measure of agreement between
1135 # probability distributions and point estimation of the overlap of two
1136 # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
1137 # http://dx.doi.org/10.1080/03610928908830127
1138 if not isinstance(other, NormalDist):
1139 raise TypeError('Expected another NormalDist instance')
1140 X, Y = self, other
Raymond Hettinger5aad0272020-06-13 19:17:28 -07001141 if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity
Raymond Hettinger318d5372019-03-06 22:59:40 -08001142 X, Y = Y, X
1143 X_var, Y_var = X.variance, Y.variance
1144 if not X_var or not Y_var:
1145 raise StatisticsError('overlap() not defined when sigma is zero')
1146 dv = Y_var - X_var
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001147 dm = fabs(Y._mu - X._mu)
Raymond Hettinger318d5372019-03-06 22:59:40 -08001148 if not dv:
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001149 return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
1150 a = X._mu * Y_var - Y._mu * X_var
1151 b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
Raymond Hettinger318d5372019-03-06 22:59:40 -08001152 x1 = (a + b) / dv
1153 x2 = (a - b) / dv
1154 return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
1155
Raymond Hettinger70f027d2020-04-16 10:25:14 -07001156 def zscore(self, x):
1157 """Compute the Standard Score. (x - mean) / stdev
1158
1159 Describes *x* in terms of the number of standard deviations
1160 above or below the mean of the normal distribution.
1161 """
1162 # https://www.statisticshowto.com/probability-and-statistics/z-score/
1163 if not self._sigma:
1164 raise StatisticsError('zscore() not defined when sigma is zero')
1165 return (x - self._mu) / self._sigma
1166
Raymond Hettinger11c79532019-02-23 14:44:07 -08001167 @property
Raymond Hettinger9e456bc2019-02-24 11:44:55 -08001168 def mean(self):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001169 "Arithmetic mean of the normal distribution."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001170 return self._mu
Raymond Hettinger9e456bc2019-02-24 11:44:55 -08001171
1172 @property
Raymond Hettinger4db25d52019-09-08 16:57:58 -07001173 def median(self):
1174 "Return the median of the normal distribution"
1175 return self._mu
1176
1177 @property
1178 def mode(self):
1179 """Return the mode of the normal distribution
1180
1181 The mode is the value x where which the probability density
1182 function (pdf) takes its maximum value.
1183 """
1184 return self._mu
1185
1186 @property
Raymond Hettinger9e456bc2019-02-24 11:44:55 -08001187 def stdev(self):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001188 "Standard deviation of the normal distribution."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001189 return self._sigma
Raymond Hettinger9e456bc2019-02-24 11:44:55 -08001190
1191 @property
Raymond Hettinger11c79532019-02-23 14:44:07 -08001192 def variance(self):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001193 "Square of the standard deviation."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001194 return self._sigma ** 2.0
Raymond Hettinger11c79532019-02-23 14:44:07 -08001195
1196 def __add__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001197 """Add a constant or another NormalDist instance.
Raymond Hettinger5f1e8b42019-03-18 22:24:15 -07001198
1199 If *other* is a constant, translate mu by the constant,
1200 leaving sigma unchanged.
1201
1202 If *other* is a NormalDist, add both the means and the variances.
1203 Mathematically, this works only if the two distributions are
1204 independent or if they are jointly normally distributed.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001205 """
Raymond Hettinger11c79532019-02-23 14:44:07 -08001206 if isinstance(x2, NormalDist):
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001207 return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
1208 return NormalDist(x1._mu + x2, x1._sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001209
1210 def __sub__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001211 """Subtract a constant or another NormalDist instance.
Raymond Hettinger5f1e8b42019-03-18 22:24:15 -07001212
1213 If *other* is a constant, translate by the constant mu,
1214 leaving sigma unchanged.
1215
1216 If *other* is a NormalDist, subtract the means and add the variances.
1217 Mathematically, this works only if the two distributions are
1218 independent or if they are jointly normally distributed.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001219 """
Raymond Hettinger11c79532019-02-23 14:44:07 -08001220 if isinstance(x2, NormalDist):
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001221 return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
1222 return NormalDist(x1._mu - x2, x1._sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001223
1224 def __mul__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001225 """Multiply both mu and sigma by a constant.
Raymond Hettinger5f1e8b42019-03-18 22:24:15 -07001226
1227 Used for rescaling, perhaps to change measurement units.
1228 Sigma is scaled with the absolute value of the constant.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001229 """
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001230 return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
Raymond Hettinger11c79532019-02-23 14:44:07 -08001231
1232 def __truediv__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001233 """Divide both mu and sigma by a constant.
Raymond Hettinger5f1e8b42019-03-18 22:24:15 -07001234
1235 Used for rescaling, perhaps to change measurement units.
1236 Sigma is scaled with the absolute value of the constant.
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001237 """
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001238 return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
Raymond Hettinger11c79532019-02-23 14:44:07 -08001239
1240 def __pos__(x1):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001241 "Return a copy of the instance."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001242 return NormalDist(x1._mu, x1._sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001243
1244 def __neg__(x1):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001245 "Negates mu while keeping sigma the same."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001246 return NormalDist(-x1._mu, x1._sigma)
Raymond Hettinger11c79532019-02-23 14:44:07 -08001247
1248 __radd__ = __add__
1249
1250 def __rsub__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001251 "Subtract a NormalDist from a constant or another NormalDist."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001252 return -(x1 - x2)
1253
1254 __rmul__ = __mul__
1255
1256 def __eq__(x1, x2):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001257 "Two NormalDist objects are equal if their mu and sigma are both equal."
Raymond Hettinger11c79532019-02-23 14:44:07 -08001258 if not isinstance(x2, NormalDist):
1259 return NotImplemented
Raymond Hettinger5eabec02019-10-18 14:20:35 -07001260 return x1._mu == x2._mu and x1._sigma == x2._sigma
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001261
1262 def __hash__(self):
Raymond Hettinger1c0e9bb2019-07-21 12:13:07 -07001263 "NormalDist objects hash equal if their mu and sigma are both equal."
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001264 return hash((self._mu, self._sigma))
Raymond Hettinger11c79532019-02-23 14:44:07 -08001265
1266 def __repr__(self):
Raymond Hettinger02c91f52019-07-21 00:34:47 -07001267 return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'