"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

==================  =============================================
Function            Description
==================  =============================================
mean                Arithmetic mean (average) of data.
harmonic_mean       Harmonic mean of data.
median              Median (middle value) of data.
median_low          Low median of data.
median_high         High median of data.
median_grouped      Median, or 50th percentile, of grouped data.
mode                Mode (most common value) of data.
multimode           List of modes (most common values of data).
==================  =============================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5.  The median of these data points is 2.8333...


Calculating variability or spread
---------------------------------

==================  =============================================
Function            Description
==================  =============================================
pvariance           Population variance of data.
variance            Sample variance of data.
pstdev              Population standard deviation of data.
stdev               Sample standard deviation of data.
==================  =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = ['StatisticsError', 'NormalDist',
           'pstdev', 'pvariance', 'stdev', 'variance',
           'median', 'median_low', 'median_high', 'median_grouped',
           'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
           ]

import math
import numbers
import random

from fractions import Fraction
from decimal import Decimal
from itertools import groupby
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from operator import itemgetter
from collections import Counter

# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get
    T = _coerce(int, type(start))
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
    """
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S: return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool: return T
    if T is int: return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T): return S
    if issubclass(T, S): return T
    # Ints coerce to the other type.
    if issubclass(T, int): return S
    if issubclass(S, int): return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))
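
# A rough illustration of the rules above (a sketch, not an exhaustive
# specification -- the authoritative cases are in test_statistics.CoerceTest):
#
#     _coerce(int, float)        -> float     (int defers to the other type)
#     _coerce(Decimal, int)      -> Decimal
#     _coerce(float, Fraction)   -> float     (Fraction & float give float)
#     _coerce(Fraction, Decimal) -> TypeError (no obvious common type)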


def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator)/T(value.denominator)
        else:
            raise


def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError


def _find_rteq(a, l, x):
    'Locate the rightmost value exactly equal to x'
    i = bisect_right(a, x, lo=l)
    if i != (len(a)+1) and a[i-1] == x:
        return i-1
    raise ValueError
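
# Example of the two helpers above (illustrative): with a = [1, 2, 2, 2, 3],
# _find_lteq(a, 2) returns 1 and _find_rteq(a, 1, 2) returns 3, giving the
# leftmost and rightmost positions occupied by the value 2.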


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x

# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total/n, T)


def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    The result is highly accurate but not as perfect as mean().
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25

    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(x):
            nonlocal n
            n += 1
            return x
        total = fsum(map(count, data))
    else:
        total = fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None

def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    try:
        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        return 0
    assert count == n
    return _convert(n/total, T)

# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        i = n//2
        return (data[i - 1] + data[i])/2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        return data[n//2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n//2]

def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    cf = l1
    f = l2 - l1 + 1
    return L + interval*(n/2 - cf)/f

def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes, return the first one encountered.

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    data = iter(data)
    try:
        return Counter(data).most_common(1)[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []

    """
    counts = Counter(iter(data)).most_common()
    maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
    return list(map(itemgetter(0), mode_items))

# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html
#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
#
# Under no circumstances use the so-called "computational formula for
# variance", as that is only suitable for hand calculations with a small
# amount of low-precision data. It has terrible numeric properties.
#
# See a comparison of three computational methods here:
# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
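#
# A small sketch of the failure mode (illustrative only -- the exact wrong
# answer produced by the naive formula depends on the platform's rounding):
#
#     data = [1e9 + x for x in (4.0, 7.0, 13.0, 16.0)]   # true sample variance is 30.0
#     (sum(x*x for x in data) - sum(data)**2 / 4) / 3    # "computational formula"
#         -> catastrophic cancellation; the result can be wrong in every digit
#     variance(data)
#         -> 30.0, because _ss() sums exact Fractions and applies a correction term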

def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.
    """
    if c is None:
        c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    U, total2, count2 = _sum((x-c) for x in data)
    assert T == U and count == count2
    total -= total2**2/len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    return _convert(ss/(n-1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be an iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    This function does not check that ``mu`` is actually the mean of ``data``.
    Giving arbitrary values for ``mu`` may lead to invalid or impossible
    results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    return _convert(ss/n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)

## Normal Distribution #####################################################

class NormalDist:
    'Normal distribution of a random variable'
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {'mu': 'Arithmetic mean of a normal distribution',
                 'sigma': 'Standard deviation of a normal distribution'}

    def __init__(self, mu=0.0, sigma=1.0):
        'NormalDist where mu is the mean and sigma is the standard deviation.'
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self.mu = mu
        self.sigma = sigma

    @classmethod
    def from_samples(cls, data):
        'Make a normal distribution instance from sample data.'
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, seed=None):
        'Generate *n* samples for a given mean and standard deviation.'
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self.mu, self.sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        'Probability density function.  P(x <= X < x+dx) / dx'
        variance = self.sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self.mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance)

    def cdf(self, x):
        'Cumulative distribution function.  P(X <= x)'
        if not self.sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))
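
    # Typical values for the standard normal distribution, shown here as an
    # illustrative comment rather than as doctests (figures are approximate):
    #     NormalDist().pdf(0.0)   ->  ~0.3989   # peak density, 1/sqrt(2*pi)
    #     NormalDist().cdf(0.0)   ->   0.5      # half the mass lies below the mean
    #     NormalDist().cdf(1.96)  ->  ~0.975    # 1.96 is the familiar two-sided 95% z-value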

    def inv_cdf(self, p):
        '''Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of the
        variable being less than or equal to that value equals the given probability.

        This function is also called the percent point function or quantile function.
        '''
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self.sigma <= 0.0:
            raise StatisticsError('inv_cdf() not defined when sigma is at or below zero')

        # There is no closed-form solution to the inverse CDF for the normal
        # distribution, so we use a rational approximation instead:
        # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
        # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
        # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.

        q = p - 0.5
        if fabs(q) <= 0.425:
            r = 0.180625 - q * q
            num = (((((((2.50908_09287_30122_6727e+3 * r +
                         3.34305_75583_58812_8105e+4) * r +
                         6.72657_70927_00870_0853e+4) * r +
                         4.59219_53931_54987_1457e+4) * r +
                         1.37316_93765_50946_1125e+4) * r +
                         1.97159_09503_06551_4427e+3) * r +
                         1.33141_66789_17843_7745e+2) * r +
                         3.38713_28727_96366_6080e+0) * q
            den = (((((((5.22649_52788_52854_5610e+3 * r +
                         2.87290_85735_72194_2674e+4) * r +
                         3.93078_95800_09271_0610e+4) * r +
                         2.12137_94301_58659_5867e+4) * r +
                         5.39419_60214_24751_1077e+3) * r +
                         6.87187_00749_20579_0830e+2) * r +
                         4.23133_30701_60091_1252e+1) * r +
                         1.0)
            x = num / den
            return self.mu + (x * self.sigma)
        r = p if q <= 0.0 else 1.0 - p
        r = sqrt(-log(r))
        if r <= 5.0:
            r = r - 1.6
            num = (((((((7.74545_01427_83414_07640e-4 * r +
                         2.27238_44989_26918_45833e-2) * r +
                         2.41780_72517_74506_11770e-1) * r +
                         1.27045_82524_52368_38258e+0) * r +
                         3.64784_83247_63204_60504e+0) * r +
                         5.76949_72214_60691_40550e+0) * r +
                         4.63033_78461_56545_29590e+0) * r +
                         1.42343_71107_49683_57734e+0)
            den = (((((((1.05075_00716_44416_84324e-9 * r +
                         5.47593_80849_95344_94600e-4) * r +
                         1.51986_66563_61645_71966e-2) * r +
                         1.48103_97642_74800_74590e-1) * r +
                         6.89767_33498_51000_04550e-1) * r +
                         1.67638_48301_83803_84940e+0) * r +
                         2.05319_16266_37758_82187e+0) * r +
                         1.0)
        else:
            r = r - 5.0
            num = (((((((2.01033_43992_92288_13265e-7 * r +
                         2.71155_55687_43487_57815e-5) * r +
                         1.24266_09473_88078_43860e-3) * r +
                         2.65321_89526_57612_30930e-2) * r +
                         2.96560_57182_85048_91230e-1) * r +
                         1.78482_65399_17291_33580e+0) * r +
                         5.46378_49111_64114_36990e+0) * r +
                         6.65790_46435_01103_77720e+0)
            den = (((((((2.04426_31033_89939_78564e-15 * r +
                         1.42151_17583_16445_88870e-7) * r +
                         1.84631_83175_10054_68180e-5) * r +
                         7.86869_13114_56132_59100e-4) * r +
                         1.48753_61290_85061_48525e-2) * r +
                         1.36929_88092_27358_05310e-1) * r +
                         5.99832_20655_58879_37690e-1) * r +
                         1.0)
        x = num / den
        if q < 0.0:
            x = -x
        return self.mu + (x * self.sigma)
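
    # Illustrative round trip (values are approximate): for NormalDist(100, 15),
    # inv_cdf(0.90) is roughly 119.2 and cdf(119.2) recovers roughly 0.90; for the
    # standard normal distribution, inv_cdf(0.975) is the well-known ~1.96.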

    def overlap(self, other):
        '''Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        '''
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y.sigma, Y.mu) < (X.sigma, X.mu):   # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y.mu - X.mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X.sigma * sqrt(2.0)))
        a = X.mu * Y_var - Y.mu * X_var
        b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    @property
    def mean(self):
        'Arithmetic mean of the normal distribution.'
        return self.mu

    @property
    def stdev(self):
        'Standard deviation of the normal distribution.'
        return self.sigma

    @property
    def variance(self):
        'Square of the standard deviation.'
        return self.sigma ** 2.0

    def __add__(x1, x2):
        '''Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu + x2, x1.sigma)
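
    # Illustrative: summing two independent normal variables adds the means and
    # combines the standard deviations in quadrature (here a 3-4-5 triangle):
    #     NormalDist(2, 3) + NormalDist(1, 4)  ->  NormalDist(mu=3, sigma=5.0)
    #     NormalDist(2, 3) + 10                ->  NormalDist(mu=12, sigma=3)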

    def __sub__(x1, x2):
        '''Subtract a constant or another NormalDist instance.

        If *other* is a constant, translate by the constant mu,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu - x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu - x2, x1.sigma)

    def __mul__(x1, x2):
        '''Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu * x2, x1.sigma * fabs(x2))

    def __truediv__(x1, x2):
        '''Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu / x2, x1.sigma / fabs(x2))

    def __pos__(x1):
        'Return a copy of the instance.'
        return NormalDist(x1.mu, x1.sigma)

    def __neg__(x1):
        'Negates mu while keeping sigma the same.'
        return NormalDist(-x1.mu, x1.sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        'Subtract a NormalDist from a constant or another NormalDist.'
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        'Two NormalDist objects are equal if their mu and sigma are both equal.'
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return (x1.mu, x1.sigma) == (x2.mu, x2.sigma)

    def __repr__(self):
        return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})'


if __name__ == '__main__':

    # Show math operations computed analytically in comparison
    # to a Monte Carlo simulation of the same operations

    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat
    import doctest

    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)

    # Test scaling by a constant
    assert (g1 * 5 / 5).mu == g1.mu
    assert (g1 * 5 / 5).sigma == g1.sigma

    n = 100_000
    G1 = g1.samples(n)
    G2 = g2.samples(n)

    for func in (add, sub):
        print(f'\nTest {func.__name__} with another NormalDist:')
        print(func(g1, g2))
        print(NormalDist.from_samples(map(func, G1, G2)))

    const = 11
    for func in (add, sub, mul, truediv):
        print(f'\nTest {func.__name__} with a constant:')
        print(func(g1, const))
        print(NormalDist.from_samples(map(func, G1, repeat(const))))

    const = 19
    for func in (add, sub, mul):
        print(f'\nTest constant with {func.__name__}:')
        print(func(const, g1))
        print(NormalDist.from_samples(map(func, repeat(const), G1)))

    def assert_close(G1, G2):
        assert isclose(G1.mu, G2.mu, rel_tol=0.01), (G1, G2)
        assert isclose(G1.sigma, G2.sigma, rel_tol=0.01), (G1, G2)

    X = NormalDist(-105, 73)
    Y = NormalDist(31, 47)
    s = 32.75
    n = 100_000

    S = NormalDist.from_samples([x + s for x in X.samples(n)])
    assert_close(X + s, S)

    S = NormalDist.from_samples([x - s for x in X.samples(n)])
    assert_close(X - s, S)

    S = NormalDist.from_samples([x * s for x in X.samples(n)])
    assert_close(X * s, S)

    S = NormalDist.from_samples([x / s for x in X.samples(n)])
    assert_close(X / s, S)

    S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X + Y, S)

    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)

    print(doctest.testmod())