"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

================== =============================================
Function           Description
================== =============================================
mean               Arithmetic mean (average) of data.
harmonic_mean      Harmonic mean of data.
median             Median (middle value) of data.
median_low         Low median of data.
median_high        High median of data.
median_grouped     Median, or 50th percentile, of grouped data.
mode               Mode (most common value) of data.
================== =============================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5. The median of these data points is 2.8333...


Calculating variability or spread
---------------------------------

================== =============================================
Function           Description
================== =============================================
pvariance          Population variance of data.
variance           Sample variance of data.
pstdev             Population standard deviation of data.
stdev              Sample standard deviation of data.
================== =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = ['StatisticsError', 'NormalDist',
           'pstdev', 'pvariance', 'stdev', 'variance',
           'median', 'median_low', 'median_high', 'median_grouped',
           'mean', 'mode', 'harmonic_mean', 'fmean',
           ]

import collections
import math
import numbers
import random

from fractions import Fraction
from decimal import Decimal
from itertools import groupby
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau


# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get
    T = _coerce(int, type(start))
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)

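# How the accumulator above behaves on a small mixed input (a minimal sketch,
# not a doctest): each value's exact ratio is grouped by denominator, so no
# precision is lost until the final Fraction sum.
#
#     >>> _sum([0.5, 0.25, 1])
#     (<class 'float'>, Fraction(7, 4), 3)
#
# Internally the partials dict maps denominator to accumulated exact
# numerator, here {1: 1, 2: 1, 4: 1}, which sums to 1 + 1/2 + 1/4 = 7/4.
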
def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
    """
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S:  return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool:  return T
    if T is int:  return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T):  return S
    if issubclass(T, S):  return T
    # Ints coerce to the other type.
    if issubclass(T, int):  return S
    if issubclass(S, int):  return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))

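# A few concrete coercions (a minimal sketch, not a doctest; they follow the
# rules spelled out above):
#
#     >>> _coerce(int, float)           # int defers to the other type
#     <class 'float'>
#     >>> _coerce(Fraction, float)      # Fraction mixed with float -> float
#     <class 'float'>
#     >>> _coerce(float, Decimal)       # unrelated types are rejected
#     Traceback (most recent call last):
#       ...
#     TypeError: don't know how to coerce float and Decimal
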
def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))

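# Behaviour on the supported types (a minimal sketch, not a doctest):
#
#     >>> _exact_ratio(Decimal("0.1"))    # exact, unlike the binary float 0.1
#     (1, 10)
#     >>> _exact_ratio(float('inf'))      # non-finite values get a None denominator
#     (inf, None)
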
def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise

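# Typical conversions at the end of a calculation (a minimal sketch, not a
# doctest): the exact Fraction total is folded back into the inferred type.
#
#     >>> _convert(Fraction(11, 4), float)
#     2.75
#     >>> _convert(Fraction(6, 2), int)    # exact integer stays an int
#     3
#     >>> _convert(Fraction(7, 2), int)    # non-integral result falls back to float
#     3.5
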
def _counts(data):
    # Generate a table of sorted (value, frequency) pairs.
    table = collections.Counter(iter(data)).most_common()
    if not table:
        return table
    # Extract the values with the highest frequency.
    maxfreq = table[0][1]
    for i in range(1, len(table)):
        if table[i][1] != maxfreq:
            table = table[:i]
            break
    return table


def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError


def _find_rteq(a, l, x):
    'Locate the rightmost value exactly equal to x'
    i = bisect_right(a, x, lo=l)
    if i != (len(a) + 1) and a[i - 1] == x:
        return i - 1
    raise ValueError


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total / n, T)


def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    The result is highly accurate but not as perfect as mean().
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25

    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(x):
            nonlocal n
            n += 1
            return x
        total = math.fsum(map(count, data))
    else:
        total = math.fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None

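# fmean() also accepts one-shot iterators; the length is counted on the fly
# (a minimal sketch, not a doctest):
#
#     >>> fmean(iter([3.5, 4.0, 5.25]))
#     4.25
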
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    try:
        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        return 0
    assert count == n
    return _convert(n/total, T)

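# The arithmetic behind the docstring example above (a worked sketch):
# harmonic_mean([2.5, 3, 10]) is n divided by the sum of the reciprocals,
#
#     3 / (1/2.5 + 1/3 + 1/10)  =  3 / (5/6)  =  18/5  =  3.6
#
# whereas the arithmetic mean (2.5 + 3 + 10) / 3 is about 5.17.
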
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n//2]
    else:
        i = n//2
        return (data[i - 1] + data[i])/2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n//2]
    else:
        return data[n//2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n//2]

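# How the three simple medians differ on the same even-sized data set
# (a minimal sketch, not a doctest):
#
#     >>> median([1, 3, 5, 7])        # interpolates between the two middle values
#     4.0
#     >>> median_low([1, 3, 5, 7])    # always a member of the data set
#     3
#     >>> median_high([1, 3, 5, 7])   # always a member of the data set
#     5
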
def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2

    # Use bisection search to locate x in data with O(log n) complexity.
    # Find the position of the leftmost occurrence of x in data.
    l1 = _find_lteq(data, x)
    # Find the position of the rightmost occurrence of x in data[l1:len(data)],
    # assuming l1 <= l2.
    l2 = _find_rteq(data, l1, x)
    cf = l1
    f = l2 - l1 + 1
    return L + interval*(n/2 - cf)/f

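# The interpolation above is the textbook grouped-median formula
#
#     median = L + interval * (n/2 - cf) / f
#
# where L is the lower limit of the class interval containing the middle value,
# cf is the cumulative frequency below that interval, and f is its frequency.
# For the module docstring example median_grouped([2, 2, 3, 3, 3, 4]):
# L = 2.5, n = 6, cf = 2, f = 3, giving 2.5 + (3 - 2)/3 = 2.8333...
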
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there is not exactly one most common value, ``mode`` will raise
    StatisticsError.
    """
    # Generate a table of sorted (value, frequency) pairs.
    table = _counts(data)
    if len(table) == 1:
        return table[0][0]
    elif table:
        raise StatisticsError(
            'no unique mode; found %d equally common values' % len(table)
        )
    else:
        raise StatisticsError('no mode for empty data')


# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html
#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
#
# Under no circumstances use the so-called "computational formula for
# variance", as that is only suitable for hand calculations with a small
# amount of low-precision data. It has terrible numeric properties.
#
# See a comparison of three computational methods here:
# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/

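# Why the "computational formula" is avoided, in one worked example that is
# easy to verify by hand (a sketch only): for
#
#     data = [1e9 + 4, 1e9 + 7, 1e9 + 13, 1e9 + 16]
#
# the deviations from the mean 1e9 + 10 are -6, -3, 3, 6, so the exact sample
# variance is (36 + 9 + 9 + 36) / 3 = 30, and variance(data) returns 30.0.
# Evaluating the shortcut sum(x*x for x in data)/3 - sum(data)**2/12 in floats
# instead cancels nearly all of the significant digits and gives a badly wrong
# answer.
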
def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.
    """
    if c is None:
        c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    U, total2, count2 = _sum((x-c) for x in data)
    assert T == U and count == count2
    total -= total2**2/len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)

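# The correction applied in _ss() above relies on the identity
#
#     sum((x - m)**2) == sum((x - c)**2) - (sum(x - c))**2 / n
#
# where m is the true mean and c is any estimate of it: the subtracted term is
# exactly zero when c == m, and otherwise compensates for rounding error in c
# (a brief sketch of the reasoning; see the references above).
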
def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    return _convert(ss/(n-1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be an iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    This function does not check that ``mu`` is actually the mean of ``data``.
    Giving arbitrary values for ``mu`` may lead to invalid or impossible
    results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    return _convert(ss/n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)

## Normal Distribution #####################################################

class NormalDist:
    'Normal distribution of a random variable'
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = ('mu', 'sigma')

    def __init__(self, mu=0.0, sigma=1.0):
        'NormalDist where mu is the mean and sigma is the standard deviation'
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self.mu = mu
        self.sigma = sigma

    @classmethod
    def from_samples(cls, data):
        'Make a normal distribution instance from sample data'
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, seed=None):
        'Generate *n* samples for a given mean and standard deviation'
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self.mu, self.sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        'Probability density function: P(x <= X < x+dx) / dx'
        variance = self.sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self.mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance)

    def cdf(self, x):
        'Cumulative distribution function: P(X <= x)'
        if not self.sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))

    @property
    def mean(self):
        'Arithmetic mean of the normal distribution'
        return self.mu

    @property
    def stdev(self):
        'Standard deviation of the normal distribution'
        return self.sigma

    @property
    def variance(self):
        'Square of the standard deviation'
        return self.sigma ** 2.0

    def __add__(x1, x2):
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu + x2, x1.sigma)

    def __sub__(x1, x2):
        if isinstance(x2, NormalDist):
            return NormalDist(x1.mu - x2.mu, hypot(x1.sigma, x2.sigma))
        return NormalDist(x1.mu - x2, x1.sigma)

    def __mul__(x1, x2):
        return NormalDist(x1.mu * x2, x1.sigma * fabs(x2))

    def __truediv__(x1, x2):
        return NormalDist(x1.mu / x2, x1.sigma / fabs(x2))

    def __pos__(x1):
        return NormalDist(x1.mu, x1.sigma)

    def __neg__(x1):
        return NormalDist(-x1.mu, x1.sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return (x1.mu, x1.sigma) == (x2.mu, x2.sigma)

    def __repr__(self):
        return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})'

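# A minimal usage sketch for NormalDist (illustrative, not a doctest):
#
#     >>> NormalDist(2, 3) + NormalDist(1, 4)    # means add, sigmas add in quadrature
#     NormalDist(mu=3, sigma=5.0)
#     >>> NormalDist(mu=100, sigma=15).cdf(100)  # half the mass lies below the mean
#     0.5
#     >>> NormalDist.from_samples([1, 2, 3, 4, 5]).mean
#     3.0
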
if __name__ == '__main__':

    # Show math operations computed analytically in comparison
    # to a Monte Carlo simulation of the same operations

    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat

    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)

    # Test scaling by a constant
    assert (g1 * 5 / 5).mu == g1.mu
    assert (g1 * 5 / 5).sigma == g1.sigma

    n = 100_000
    G1 = g1.samples(n)
    G2 = g2.samples(n)

    for func in (add, sub):
        print(f'\nTest {func.__name__} with another NormalDist:')
        print(func(g1, g2))
        print(NormalDist.from_samples(map(func, G1, G2)))

    const = 11
    for func in (add, sub, mul, truediv):
        print(f'\nTest {func.__name__} with a constant:')
        print(func(g1, const))
        print(NormalDist.from_samples(map(func, G1, repeat(const))))

    const = 19
    for func in (add, sub, mul):
        print(f'\nTest constant with {func.__name__}:')
        print(func(const, g1))
        print(NormalDist.from_samples(map(func, repeat(const), G1)))

    def assert_close(G1, G2):
        assert isclose(G1.mu, G2.mu, rel_tol=0.01), (G1, G2)
        assert isclose(G1.sigma, G2.sigma, rel_tol=0.01), (G1, G2)

    X = NormalDist(-105, 73)
    Y = NormalDist(31, 47)
    s = 32.75
    n = 100_000

    S = NormalDist.from_samples([x + s for x in X.samples(n)])
    assert_close(X + s, S)

    S = NormalDist.from_samples([x - s for x in X.samples(n)])
    assert_close(X - s, S)

    S = NormalDist.from_samples([x * s for x in X.samples(n)])
    assert_close(X * s, S)

    S = NormalDist.from_samples([x / s for x in X.samples(n)])
    assert_close(X / s, S)

    S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X + Y, S)

    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)