"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

==================  =============================================
Function            Description
==================  =============================================
mean                Arithmetic mean (average) of data.
fmean               Fast, floating point arithmetic mean.
harmonic_mean       Harmonic mean of data.
median              Median (middle value) of data.
median_low          Low median of data.
median_high         High median of data.
median_grouped      Median, or 50th percentile, of grouped data.
mode                Mode (most common value) of data.
multimode           List of modes (most common values of data).
==================  =============================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5. The median of these data points is 2.8333...


Calculating variability or spread
---------------------------------

==================  =============================================
Function            Description
==================  =============================================
pvariance           Population variance of data.
variance            Sample variance of data.
pstdev              Population standard deviation of data.
stdev               Sample standard deviation of data.
==================  =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = ['StatisticsError', 'NormalDist',
           'pstdev', 'pvariance', 'stdev', 'variance',
           'median', 'median_low', 'median_high', 'median_grouped',
           'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
           ]

import math
import numbers
import random

from fractions import Fraction
from decimal import Decimal
from itertools import groupby
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from operator import itemgetter
from collections import Counter

# === Exceptions ===

class StatisticsError(ValueError):
    pass

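# Because StatisticsError subclasses ValueError, callers can catch it with
# either exception type.  A minimal illustration (not part of the module's
# doctests):
#
#     >>> try:
#     ...     mean([])
#     ... except ValueError:      # also catches StatisticsError
#     ...     print('no data')
#     no data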

# === Private utilities ===

def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get
    T = _coerce(int, type(start))
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
    """
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S: return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool: return T
    if T is int: return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T): return S
    if issubclass(T, S): return T
    # Ints coerce to the other type.
    if issubclass(T, int): return S
    if issubclass(S, int): return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))


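# A rough sketch of how the _coerce() rules play out in practice (these calls
# are illustrative only and are not part of the module's doctests):
#
#     >>> _coerce(int, float)            # int defers to the other type
#     <class 'float'>
#     >>> _coerce(Fraction, float)       # Fraction & float -> float
#     <class 'float'>
#     >>> _coerce(float, Decimal)        # no rule matches
#     Traceback (most recent call last):
#         ...
#     TypeError: don't know how to coerce float and Decimal

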
def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator)/T(value.denominator)
        else:
            raise


def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError


def _find_rteq(a, l, x):
    'Locate the rightmost value exactly equal to x'
    i = bisect_right(a, x, lo=l)
    if i != (len(a)+1) and a[i-1] == x:
        return i-1
    raise ValueError


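# Worked example for the two bisection helpers above (illustrative only, not
# part of the module's doctests): with a = [1, 2, 2, 2, 3],
# _find_lteq(a, 2) == 1 and _find_rteq(a, 1, 2) == 3, so median_grouped() can
# compute cf (items below the median class) and f (items inside the class)
# without a linear scan.

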
def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total/n, T)

def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and always returns a float.
    The result is highly accurate but not as exact as mean().
    If the input dataset is empty, a StatisticsError is raised.

    >>> fmean([3.5, 4.0, 5.25])
    4.25

    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(x):
            nonlocal n
            n += 1
            return x
        total = fsum(map(count, data))
    else:
        total = fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None

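# The count() wrapper above lets fmean() make a single pass over inputs that
# have no __len__, such as generators.  A quick illustration (not part of the
# module's doctests):
#
#     >>> fmean(x / 2 for x in range(5))   # 0.0, 0.5, 1.0, 1.5, 2.0
#     1.0
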
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earnings) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    try:
        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        return 0
    assert count == n
    return _convert(n/total, T)


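# Arithmetic behind the docstring example above (illustration only): the
# harmonic mean of [2.5, 3, 10] is
#
#     3 / (1/2.5 + 1/3 + 1/10) = 3 / (0.4 + 0.3333... + 0.1) = 3.6
#
# i.e. n divided by the sum of reciprocals, which is what the call to _sum()
# above computes.

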
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        i = n//2
        return (data[i - 1] + data[i])/2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        return data[n//2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n//2]


def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2

    # Use bisection to locate x in data in O(log n) time.
    # Find the position of the leftmost occurrence of x in data.
    l1 = _find_lteq(data, x)
    # Find the position of the rightmost occurrence of x in data[l1:len(data)].
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    cf = l1
    f = l2 - l1 + 1
    return L + interval*(n/2 - cf)/f


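# Worked example of the interpolation above (illustration only), using the
# docstring data [1, 2, 2, 3, 4, 4, 4, 4, 4, 5] with interval=1: the median
# class is centred on x = 4, so L = 3.5, cf = 4 data points lie below the
# class, and f = 5 lie inside it, giving
#
#     L + interval*(n/2 - cf)/f  =  3.5 + 1*(5 - 4)/5  =  3.7
#
# which matches the doctest result.

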
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes, return the first one encountered:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    data = iter(data)
    try:
        return Counter(data).most_common(1)[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []

    """
    counts = Counter(iter(data)).most_common()
    maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
    return list(map(itemgetter(0), mode_items))


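# How the groupby() step above works (illustration only): most_common()
# returns (value, count) pairs sorted by count in descending order, e.g.
# [('b', 4), ('d', 4), ('f', 4), ('a', 2), ...] for 'aabbbbccddddeeffffgg'.
# Grouping by the count field and keeping only the first group therefore
# yields exactly the values tied for the highest count.

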
# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html
#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
#
# Under no circumstances use the so-called "computational formula for
# variance", as that is only suitable for hand calculations with a small
# amount of low-precision data. It has terrible numeric properties.
#
# See a comparison of three computational methods here:
# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/

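# Illustration of why the "computational formula" is avoided (this sketch is
# not part of the module).  For data offset far from zero, e.g.
#
#     data = [1e9 + 4, 1e9 + 7, 1e9 + 13, 1e9 + 16]     # true pvariance = 22.5
#
# the one-pass formula (sum(x*x for x in data) - sum(data)**2/n) / n squares
# numbers near 1e18 and then subtracts two nearly equal quantities, so most
# of the significant digits cancel and the float result is dominated by
# rounding error.  The two-pass approach in _ss() below instead sums exact
# Fractions of the deviations from the mean, and pvariance(data) returns 22.5.
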
def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.
    """
    if c is None:
        c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    U, total2, count2 = _sum((x-c) for x in data)
    assert T == U and count == count2
    total -= total2**2/len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    return _convert(ss/(n-1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be an iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    This function does not check that ``mu`` is actually the mean of ``data``.
    Giving arbitrary values for ``mu`` may lead to invalid or impossible
    results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    return _convert(ss/n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)

## Normal Distribution #####################################################

class NormalDist:
    'Normal distribution of a random variable'
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = ('mu', 'sigma')

    def __init__(self, mu=0.0, sigma=1.0):
        'NormalDist where mu is the mean and sigma is the standard deviation.'
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self.mu = mu
        self.sigma = sigma

    @classmethod
    def from_samples(cls, data):
        'Make a normal distribution instance from sample data.'
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, seed=None):
        'Generate *n* samples for a given mean and standard deviation.'
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self.mu, self.sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        'Probability density function. P(x <= X < x+dx) / dx'
        variance = self.sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self.mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance)

    def cdf(self, x):
        'Cumulative distribution function. P(X <= x)'
        if not self.sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))

    def inv_cdf(self, p):
        '''Inverse cumulative distribution function. x : P(X <= x) = p

        Finds the value of the random variable such that the probability of the
        variable being less than or equal to that value equals the given probability.

        This function is also called the percent point function or quantile function.
        '''
        if (p <= 0.0 or p >= 1.0):
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self.sigma <= 0.0:
            raise StatisticsError('inv_cdf() not defined when sigma is at or below zero')

        # There is no closed-form solution to the inverse CDF for the normal
        # distribution, so we use a rational approximation instead:
        # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
        # Normal Distribution". Applied Statistics. Blackwell Publishing. 37
        # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.

        q = p - 0.5
        if fabs(q) <= 0.425:
            r = 0.180625 - q * q
            num = (((((((2.50908_09287_30122_6727e+3 * r +
                         3.34305_75583_58812_8105e+4) * r +
                         6.72657_70927_00870_0853e+4) * r +
                         4.59219_53931_54987_1457e+4) * r +
                         1.37316_93765_50946_1125e+4) * r +
                         1.97159_09503_06551_4427e+3) * r +
                         1.33141_66789_17843_7745e+2) * r +
                         3.38713_28727_96366_6080e+0) * q
            den = (((((((5.22649_52788_52854_5610e+3 * r +
                         2.87290_85735_72194_2674e+4) * r +
                         3.93078_95800_09271_0610e+4) * r +
                         2.12137_94301_58659_5867e+4) * r +
                         5.39419_60214_24751_1077e+3) * r +
                         6.87187_00749_20579_0830e+2) * r +
                         4.23133_30701_60091_1252e+1) * r +
                         1.0)
            x = num / den
            return self.mu + (x * self.sigma)
        r = p if q <= 0.0 else 1.0 - p
        r = sqrt(-log(r))
        if r <= 5.0:
            r = r - 1.6
            num = (((((((7.74545_01427_83414_07640e-4 * r +
                         2.27238_44989_26918_45833e-2) * r +
                         2.41780_72517_74506_11770e-1) * r +
                         1.27045_82524_52368_38258e+0) * r +
                         3.64784_83247_63204_60504e+0) * r +
                         5.76949_72214_60691_40550e+0) * r +
                         4.63033_78461_56545_29590e+0) * r +
                         1.42343_71107_49683_57734e+0)
            den = (((((((1.05075_00716_44416_84324e-9 * r +
                         5.47593_80849_95344_94600e-4) * r +
                         1.51986_66563_61645_71966e-2) * r +
                         1.48103_97642_74800_74590e-1) * r +
                         6.89767_33498_51000_04550e-1) * r +
                         1.67638_48301_83803_84940e+0) * r +
                         2.05319_16266_37758_82187e+0) * r +
                         1.0)
        else:
            r = r - 5.0
            num = (((((((2.01033_43992_92288_13265e-7 * r +
                         2.71155_55687_43487_57815e-5) * r +
                         1.24266_09473_88078_43860e-3) * r +
                         2.65321_89526_57612_30930e-2) * r +
                         2.96560_57182_85048_91230e-1) * r +
                         1.78482_65399_17291_33580e+0) * r +
                         5.46378_49111_64114_36990e+0) * r +
                         6.65790_46435_01103_77720e+0)
            den = (((((((2.04426_31033_89939_78564e-15 * r +
                         1.42151_17583_16445_88870e-7) * r +
                         1.84631_83175_10054_68180e-5) * r +
                         7.86869_13114_56132_59100e-4) * r +
                         1.48753_61290_85061_48525e-2) * r +
                         1.36929_88092_27358_05310e-1) * r +
                         5.99832_20655_58879_37690e-1) * r +
                         1.0)
        x = num / den
        if q < 0.0:
            x = -x
        return self.mu + (x * self.sigma)

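    # Sanity check for inv_cdf(), shown as an illustration only (not part of
    # the module's doctests): it should invert cdf() to within the accuracy
    # of the AS241 approximation, e.g.
    #
    #     >>> nd = NormalDist(10, 2)
    #     >>> round(nd.inv_cdf(nd.cdf(12.5)), 9)
    #     12.5
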
    def overlap(self, other):
        '''Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        '''
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y.sigma, Y.mu) < (X.sigma, X.mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y.mu - X.mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X.sigma * sqrt(2.0)))
        a = X.mu * Y_var - Y.mu * X_var
        b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

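    # Quick sanity checks on overlap(), illustrative only: two identical
    # distributions overlap completely, e.g. NormalDist().overlap(NormalDist())
    # returns 1.0, and the result shrinks toward 0.0 as the means move apart
    # relative to the standard deviations.
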
    @property
    def mean(self):
        'Arithmetic mean of the normal distribution.'
        return self.mu

    @property
    def stdev(self):
        'Standard deviation of the normal distribution.'
        return self.sigma

    @property
    def variance(self):
        'Square of the standard deviation.'
        return self.sigma ** 2.0

    def __add__(x1, x2):
        '''Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu + x2, x1.sigma)

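    # Worked example for the variance addition above (illustration only):
    # adding independent NormalDist(3, 4) and NormalDist(0, 3) gives a mean of
    # 3 + 0 = 3 and a standard deviation of hypot(4, 3) = sqrt(16 + 9) = 5,
    # i.e. NormalDist(mu=3, sigma=5.0).
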
    def __sub__(x1, x2):
        '''Subtract a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        '''
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu - x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu - x2, x1.sigma)

    def __mul__(x1, x2):
        '''Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu * x2, x1.sigma * fabs(x2))

    def __truediv__(x1, x2):
        '''Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        '''
        return NormalDist(x1.mu / x2, x1.sigma / fabs(x2))

    def __pos__(x1):
        'Return a copy of the instance.'
        return NormalDist(x1.mu, x1.sigma)

    def __neg__(x1):
        'Negates mu while keeping sigma the same.'
        return NormalDist(-x1.mu, x1.sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        'Subtract a NormalDist from a constant or another NormalDist.'
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        'Two NormalDist objects are equal if their mu and sigma are both equal.'
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return (x1.mu, x1.sigma) == (x2.mu, x2.sigma)

    def __repr__(self):
        return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})'


if __name__ == '__main__':

    # Show math operations computed analytically in comparison
    # to a Monte Carlo simulation of the same operations

    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat
    import doctest

    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)

    # Test scaling by a constant
    assert (g1 * 5 / 5).mu == g1.mu
    assert (g1 * 5 / 5).sigma == g1.sigma

    n = 100_000
    G1 = g1.samples(n)
    G2 = g2.samples(n)

    for func in (add, sub):
        print(f'\nTest {func.__name__} with another NormalDist:')
        print(func(g1, g2))
        print(NormalDist.from_samples(map(func, G1, G2)))

    const = 11
    for func in (add, sub, mul, truediv):
        print(f'\nTest {func.__name__} with a constant:')
        print(func(g1, const))
        print(NormalDist.from_samples(map(func, G1, repeat(const))))

    const = 19
    for func in (add, sub, mul):
        print(f'\nTest constant with {func.__name__}:')
        print(func(const, g1))
        print(NormalDist.from_samples(map(func, repeat(const), G1)))

    def assert_close(G1, G2):
        assert isclose(G1.mu, G2.mu, rel_tol=0.01), (G1, G2)
        assert isclose(G1.sigma, G2.sigma, rel_tol=0.01), (G1, G2)

    X = NormalDist(-105, 73)
    Y = NormalDist(31, 47)
    s = 32.75
    n = 100_000

    S = NormalDist.from_samples([x + s for x in X.samples(n)])
    assert_close(X + s, S)

    S = NormalDist.from_samples([x - s for x in X.samples(n)])
    assert_close(X - s, S)

    S = NormalDist.from_samples([x * s for x in X.samples(n)])
    assert_close(X * s, S)

    S = NormalDist.from_samples([x / s for x in X.samples(n)])
    assert_close(X / s, S)

    S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X + Y, S)

    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)

    print(doctest.testmod())