blob: 7d53e0c0e2b1785b82fbfc0737b09f39e89922cf [file] [log] [blame]
Larry Hastingsf5e987b2013-10-19 11:50:09 -07001"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
10================== =============================================
11Function Description
12================== =============================================
13mean Arithmetic mean (average) of data.
Steven D'Apranoe5803d92016-08-24 12:17:00 +100014geometric_mean Geometric mean of data.
Steven D'Apranoa474afd2016-08-09 12:49:01 +100015harmonic_mean Harmonic mean of data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -070016median Median (middle value) of data.
17median_low Low median of data.
18median_high High median of data.
19median_grouped Median, or 50th percentile, of grouped data.
20mode Mode (most common value) of data.
21================== =============================================
22
23Calculate the arithmetic mean ("the average") of data:
24
25>>> mean([-1.0, 2.5, 3.25, 5.75])
262.625
27
28
29Calculate the standard median of discrete data:
30
31>>> median([2, 3, 4, 5])
323.5
33
34
35Calculate the median, or 50th percentile, of data grouped into class intervals
36centred on the data values provided. E.g. if your data points are rounded to
37the nearest whole number:
38
39>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
402.8333333333...
41
42This should be interpreted in this way: you have two data points in the class
43interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
44the class interval 3.5-4.5. The median of these data points is 2.8333...
45
46
47Calculating variability or spread
48---------------------------------
49
50================== =============================================
51Function Description
52================== =============================================
53pvariance Population variance of data.
54variance Sample variance of data.
55pstdev Population standard deviation of data.
56stdev Sample standard deviation of data.
57================== =============================================
58
59Calculate the standard deviation of sample data:
60
61>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
624.38961843444...
63
64If you have previously calculated the mean, you can pass it as the optional
65second argument to the four "spread" functions to avoid recalculating it:
66
67>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
68>>> mu = mean(data)
69>>> pvariance(data, mu)
702.5
71
72
73Exceptions
74----------
75
76A single exception is defined: StatisticsError is a subclass of ValueError.
77
78"""
79
80__all__ = [ 'StatisticsError',
81 'pstdev', 'pvariance', 'stdev', 'variance',
82 'median', 'median_low', 'median_high', 'median_grouped',
Steven D'Apranoe5803d92016-08-24 12:17:00 +100083 'mean', 'mode', 'geometric_mean', 'harmonic_mean',
Larry Hastingsf5e987b2013-10-19 11:50:09 -070084 ]
85
Larry Hastingsf5e987b2013-10-19 11:50:09 -070086import collections
Steven D'Apranoa474afd2016-08-09 12:49:01 +100087import decimal
Larry Hastingsf5e987b2013-10-19 11:50:09 -070088import math
Steven D'Apranoa474afd2016-08-09 12:49:01 +100089import numbers
Larry Hastingsf5e987b2013-10-19 11:50:09 -070090
91from fractions import Fraction
92from decimal import Decimal
Steven D'Apranoa474afd2016-08-09 12:49:01 +100093from itertools import groupby, chain
Steven D'Aprano3b06e242016-05-05 03:54:29 +100094from bisect import bisect_left, bisect_right
Steven D'Apranob28c3272015-12-01 19:59:53 +110095
Larry Hastingsf5e987b2013-10-19 11:50:09 -070096
97
98# === Exceptions ===
99
100class StatisticsError(ValueError):
101 pass
102
103
104# === Private utilities ===
105
106def _sum(data, start=0):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100107 """_sum(data [, start]) -> (type, sum, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700108
Steven D'Apranob28c3272015-12-01 19:59:53 +1100109 Return a high-precision sum of the given numeric data as a fraction,
110 together with the type to be converted to and the count of items.
111
112 If optional argument ``start`` is given, it is added to the total.
113 If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700114
115
116 Examples
117 --------
118
119 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700120 (<class 'float'>, Fraction(11, 1), 5)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700121
122 Some sources of round-off error will be avoided:
123
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000124 # Built-in sum returns zero.
125 >>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700126 (<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700127
128 Fractions and Decimals are also supported:
129
130 >>> from fractions import Fraction as F
131 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Petersonab078e92016-07-13 21:13:29 -0700132 (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700133
134 >>> from decimal import Decimal as D
135 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
136 >>> _sum(data)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700137 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700138
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000139 Mixed types are currently treated as an error, except that int is
140 allowed.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700141 """
Steven D'Apranob28c3272015-12-01 19:59:53 +1100142 count = 0
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700143 n, d = _exact_ratio(start)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100144 partials = {d: n}
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700145 partials_get = partials.get
Steven D'Apranob28c3272015-12-01 19:59:53 +1100146 T = _coerce(int, type(start))
147 for typ, values in groupby(data, type):
148 T = _coerce(T, typ) # or raise TypeError
149 for n,d in map(_exact_ratio, values):
150 count += 1
151 partials[d] = partials_get(d, 0) + n
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700152 if None in partials:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100153 # The sum will be a NAN or INF. We can ignore all the finite
154 # partials, and just look at this special one.
155 total = partials[None]
156 assert not _isfinite(total)
157 else:
158 # Sum all the partial sums using builtin sum.
159 # FIXME is this faster if we sum them in order of the denominator?
160 total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
161 return (T, total, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700162
163
Steven D'Apranob28c3272015-12-01 19:59:53 +1100164def _isfinite(x):
165 try:
166 return x.is_finite() # Likely a Decimal.
167 except AttributeError:
168 return math.isfinite(x) # Coerces to float first.
169
170
171def _coerce(T, S):
172 """Coerce types T and S to a common type, or raise TypeError.
173
174 Coercion rules are currently an implementation detail. See the CoerceTest
175 test class in test_statistics for details.
176 """
177 # See http://bugs.python.org/issue24068.
178 assert T is not bool, "initial type T is bool"
179 # If the types are the same, no need to coerce anything. Put this
180 # first, so that the usual case (no coercion needed) happens as soon
181 # as possible.
182 if T is S: return T
183 # Mixed int & other coerce to the other type.
184 if S is int or S is bool: return T
185 if T is int: return S
186 # If one is a (strict) subclass of the other, coerce to the subclass.
187 if issubclass(S, T): return S
188 if issubclass(T, S): return T
189 # Ints coerce to the other type.
190 if issubclass(T, int): return S
191 if issubclass(S, int): return T
192 # Mixed fraction & float coerces to float (or float subclass).
193 if issubclass(T, Fraction) and issubclass(S, float):
194 return S
195 if issubclass(T, float) and issubclass(S, Fraction):
196 return T
197 # Any other combination is disallowed.
198 msg = "don't know how to coerce %s and %s"
199 raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000200
201
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700202def _exact_ratio(x):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100203 """Return Real number x to exact (numerator, denominator) pair.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700204
205 >>> _exact_ratio(0.25)
206 (1, 4)
207
208 x is expected to be an int, Fraction, Decimal or float.
209 """
210 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100211 # Optimise the common case of floats. We expect that the most often
212 # used numeric type will be builtin floats, so try to make this as
213 # fast as possible.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000214 if type(x) is float or type(x) is Decimal:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100215 return x.as_integer_ratio()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700216 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100217 # x may be an int, Fraction, or Integral ABC.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700218 return (x.numerator, x.denominator)
219 except AttributeError:
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700220 try:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000221 # x may be a float or Decimal subclass.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700222 return x.as_integer_ratio()
223 except AttributeError:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000224 # Just give up?
225 pass
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700226 except (OverflowError, ValueError):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100227 # float NAN or INF.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000228 assert not _isfinite(x)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700229 return (x, None)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100230 msg = "can't convert type '{}' to numerator/denominator"
231 raise TypeError(msg.format(type(x).__name__))
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700232
233
Steven D'Apranob28c3272015-12-01 19:59:53 +1100234def _convert(value, T):
235 """Convert value to given numeric type T."""
236 if type(value) is T:
237 # This covers the cases where T is Fraction, or where value is
238 # a NAN or INF (Decimal or float).
239 return value
240 if issubclass(T, int) and value.denominator != 1:
241 T = float
242 try:
243 # FIXME: what do we do if this overflows?
244 return T(value)
245 except TypeError:
246 if issubclass(T, Decimal):
247 return T(value.numerator)/T(value.denominator)
248 else:
249 raise
250
251
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700252def _counts(data):
253 # Generate a table of sorted (value, frequency) pairs.
Nick Coghlanbfd68bf2014-02-08 19:44:16 +1000254 table = collections.Counter(iter(data)).most_common()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700255 if not table:
256 return table
257 # Extract the values with the highest frequency.
258 maxfreq = table[0][1]
259 for i in range(1, len(table)):
260 if table[i][1] != maxfreq:
261 table = table[:i]
262 break
263 return table
264
265
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000266def _find_lteq(a, x):
267 'Locate the leftmost value exactly equal to x'
268 i = bisect_left(a, x)
269 if i != len(a) and a[i] == x:
270 return i
271 raise ValueError
272
273
274def _find_rteq(a, l, x):
275 'Locate the rightmost value exactly equal to x'
276 i = bisect_right(a, x, lo=l)
277 if i != (len(a)+1) and a[i-1] == x:
278 return i-1
279 raise ValueError
280
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000281
282def _fail_neg(values, errmsg='negative value'):
283 """Iterate over values, failing if any are less than zero."""
284 for x in values:
285 if x < 0:
286 raise StatisticsError(errmsg)
287 yield x
288
289
Steven D'Aprano9a2be912016-08-09 13:58:10 +1000290class _nroot_NS:
291 """Hands off! Don't touch!
292
293 Everything inside this namespace (class) is an even-more-private
294 implementation detail of the private _nth_root function.
295 """
296 # This class exists only to be used as a namespace, for convenience
297 # of being able to keep the related functions together, and to
298 # collapse the group in an editor. If this were C# or C++, I would
299 # use a Namespace, but the closest Python has is a class.
300 #
301 # FIXME possibly move this out into a separate module?
302 # That feels like overkill, and may encourage people to treat it as
303 # a public feature.
304 def __init__(self):
305 raise TypeError('namespace only, do not instantiate')
306
307 def nth_root(x, n):
308 """Return the positive nth root of numeric x.
309
310 This may be more accurate than ** or pow():
311
312 >>> math.pow(1000, 1.0/3) #doctest:+SKIP
313 9.999999999999998
314
315 >>> _nth_root(1000, 3)
316 10.0
317 >>> _nth_root(11**5, 5)
318 11.0
319 >>> _nth_root(2, 12)
320 1.0594630943592953
321
322 """
323 if not isinstance(n, int):
324 raise TypeError('degree n must be an int')
325 if n < 2:
326 raise ValueError('degree n must be 2 or more')
327 if isinstance(x, decimal.Decimal):
328 return _nroot_NS.decimal_nroot(x, n)
329 elif isinstance(x, numbers.Real):
330 return _nroot_NS.float_nroot(x, n)
331 else:
332 raise TypeError('expected a number, got %s') % type(x).__name__
333
334 def float_nroot(x, n):
335 """Handle nth root of Reals, treated as a float."""
336 assert isinstance(n, int) and n > 1
337 if x < 0:
Steven D'Apranod6ea3012016-08-24 12:48:12 +1000338 raise ValueError('domain error: root of negative number')
Steven D'Aprano9a2be912016-08-09 13:58:10 +1000339 elif x == 0:
340 return math.copysign(0.0, x)
341 elif x > 0:
342 try:
343 isinfinity = math.isinf(x)
344 except OverflowError:
345 return _nroot_NS.bignum_nroot(x, n)
346 else:
347 if isinfinity:
348 return float('inf')
349 else:
350 return _nroot_NS.nroot(x, n)
351 else:
352 assert math.isnan(x)
353 return float('nan')
354
355 def nroot(x, n):
356 """Calculate x**(1/n), then improve the answer."""
357 # This uses math.pow() to calculate an initial guess for the root,
358 # then uses the iterated nroot algorithm to improve it.
359 #
360 # By my testing, about 8% of the time the iterated algorithm ends
361 # up converging to a result which is less accurate than the initial
362 # guess. [FIXME: is this still true?] In that case, we use the
363 # guess instead of the "improved" value. This way, we're never
364 # less accurate than math.pow().
365 r1 = math.pow(x, 1.0/n)
366 eps1 = abs(r1**n - x)
367 if eps1 == 0.0:
368 # r1 is the exact root, so we're done. By my testing, this
369 # occurs about 80% of the time for x < 1 and 30% of the
370 # time for x > 1.
371 return r1
372 else:
373 try:
374 r2 = _nroot_NS.iterated_nroot(x, n, r1)
375 except RuntimeError:
376 return r1
377 else:
378 eps2 = abs(r2**n - x)
379 if eps1 < eps2:
380 return r1
381 return r2
382
383 def iterated_nroot(a, n, g):
384 """Return the nth root of a, starting with guess g.
385
386 This is a special case of Newton's Method.
387 https://en.wikipedia.org/wiki/Nth_root_algorithm
388 """
389 np = n - 1
390 def iterate(r):
391 try:
392 return (np*r + a/math.pow(r, np))/n
393 except OverflowError:
394 # If r is large enough, r**np may overflow. If that
395 # happens, r**-np will be small, but not necessarily zero.
396 return (np*r + a*math.pow(r, -np))/n
397 # With a good guess, such as g = a**(1/n), this will converge in
398 # only a few iterations. However a poor guess can take thousands
399 # of iterations to converge, if at all. We guard against poor
400 # guesses by setting an upper limit to the number of iterations.
401 r1 = g
402 r2 = iterate(g)
403 for i in range(1000):
404 if r1 == r2:
405 break
406 # Use Floyd's cycle-finding algorithm to avoid being trapped
407 # in a cycle.
408 # https://en.wikipedia.org/wiki/Cycle_detection#Tortoise_and_hare
409 r1 = iterate(r1)
410 r2 = iterate(iterate(r2))
411 else:
412 # If the guess is particularly bad, the above may fail to
413 # converge in any reasonable time.
414 raise RuntimeError('nth-root failed to converge')
415 return r2
416
417 def decimal_nroot(x, n):
418 """Handle nth root of Decimals."""
419 assert isinstance(x, decimal.Decimal)
420 assert isinstance(n, int)
421 if x.is_snan():
422 # Signalling NANs always raise.
423 raise decimal.InvalidOperation('nth-root of snan')
424 if x.is_qnan():
425 # Quiet NANs only raise if the context is set to raise,
426 # otherwise return a NAN.
427 ctx = decimal.getcontext()
428 if ctx.traps[decimal.InvalidOperation]:
429 raise decimal.InvalidOperation('nth-root of nan')
430 else:
431 # Preserve the input NAN.
432 return x
Steven D'Apranod6ea3012016-08-24 12:48:12 +1000433 if x < 0:
434 raise ValueError('domain error: root of negative number')
Steven D'Aprano9a2be912016-08-09 13:58:10 +1000435 if x.is_infinite():
436 return x
437 # FIXME this hasn't had the extensive testing of the float
438 # version _iterated_nroot so there's possibly some buggy
439 # corner cases buried in here. Can it overflow? Fail to
440 # converge or get trapped in a cycle? Converge to a less
441 # accurate root?
442 np = n - 1
443 def iterate(r):
444 return (np*r + x/r**np)/n
445 r0 = x**(decimal.Decimal(1)/n)
446 assert isinstance(r0, decimal.Decimal)
447 r1 = iterate(r0)
448 while True:
449 if r1 == r0:
450 return r1
451 r0, r1 = r1, iterate(r1)
452
453 def bignum_nroot(x, n):
454 """Return the nth root of a positive huge number."""
455 assert x > 0
456 # I state without proof that ⁿ√x ≈ ⁿ√2·ⁿ√(x//2)
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700457 # and that for sufficiently big x the error is acceptable.
Steven D'Aprano9a2be912016-08-09 13:58:10 +1000458 # We now halve x until it is small enough to get the root.
459 m = 0
460 while True:
461 x //= 2
462 m += 1
463 try:
464 y = float(x)
465 except OverflowError:
466 continue
467 break
468 a = _nroot_NS.nroot(y, n)
469 # At this point, we want the nth-root of 2**m, or 2**(m/n).
470 # We can write that as 2**(q + r/n) = 2**q * ⁿ√2**r where q = m//n.
471 q, r = divmod(m, n)
472 b = 2**q * _nroot_NS.nroot(2**r, n)
473 return a * b
474
475
476# This is the (private) function for calculating nth roots:
477_nth_root = _nroot_NS.nth_root
478assert type(_nth_root) is type(lambda: None)
479
480
481def _product(values):
482 """Return product of values as (exponent, mantissa)."""
483 errmsg = 'mixed Decimal and float is not supported'
484 prod = 1
485 for x in values:
486 if isinstance(x, float):
487 break
488 prod *= x
489 else:
490 return (0, prod)
491 if isinstance(prod, Decimal):
492 raise TypeError(errmsg)
493 # Since floats can overflow easily, we calculate the product as a
494 # sort of poor-man's BigFloat. Given that:
495 #
496 # x = 2**p * m # p == power or exponent (scale), m = mantissa
497 #
498 # we can calculate the product of two (or more) x values as:
499 #
500 # x1*x2 = 2**p1*m1 * 2**p2*m2 = 2**(p1+p2)*(m1*m2)
501 #
502 mant, scale = 1, 0 #math.frexp(prod) # FIXME
503 for y in chain([x], values):
504 if isinstance(y, Decimal):
505 raise TypeError(errmsg)
506 m1, e1 = math.frexp(y)
507 m2, e2 = math.frexp(mant)
508 scale += (e1 + e2)
509 mant = m1*m2
510 return (scale, mant)
511
512
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700513# === Measures of central tendency (averages) ===
514
515def mean(data):
516 """Return the sample arithmetic mean of data.
517
518 >>> mean([1, 2, 3, 4, 4])
519 2.8
520
521 >>> from fractions import Fraction as F
522 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
523 Fraction(13, 21)
524
525 >>> from decimal import Decimal as D
526 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
527 Decimal('0.5625')
528
529 If ``data`` is empty, StatisticsError will be raised.
530 """
531 if iter(data) is data:
532 data = list(data)
533 n = len(data)
534 if n < 1:
535 raise StatisticsError('mean requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100536 T, total, count = _sum(data)
537 assert count == n
538 return _convert(total/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700539
540
Steven D'Aprano9a2be912016-08-09 13:58:10 +1000541def geometric_mean(data):
542 """Return the geometric mean of data.
543
544 The geometric mean is appropriate when averaging quantities which
545 are multiplied together rather than added, for example growth rates.
546 Suppose an investment grows by 10% in the first year, falls by 5% in
547 the second, then grows by 12% in the third, what is the average rate
548 of growth over the three years?
549
550 >>> geometric_mean([1.10, 0.95, 1.12])
551 1.0538483123382172
552
553 giving an average growth of 5.385%. Using the arithmetic mean will
554 give approximately 5.667%, which is too high.
555
556 ``StatisticsError`` will be raised if ``data`` is empty, or any
557 element is less than zero.
558 """
559 if iter(data) is data:
560 data = list(data)
561 errmsg = 'geometric mean does not support negative values'
562 n = len(data)
563 if n < 1:
564 raise StatisticsError('geometric_mean requires at least one data point')
565 elif n == 1:
566 x = data[0]
567 if isinstance(g, (numbers.Real, Decimal)):
568 if x < 0:
569 raise StatisticsError(errmsg)
570 return x
571 else:
572 raise TypeError('unsupported type')
573 else:
574 scale, prod = _product(_fail_neg(data, errmsg))
575 r = _nth_root(prod, n)
576 if scale:
577 p, q = divmod(scale, n)
578 s = 2**p * _nth_root(2**q, n)
579 else:
580 s = 1
581 return s*r
582
583
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000584def harmonic_mean(data):
585 """Return the harmonic mean of data.
586
587 The harmonic mean, sometimes called the subcontrary mean, is the
588 reciprocal of the arithmetic mean of the reciprocals of the data,
589 and is often appropriate when averaging quantities which are rates
590 or ratios, for example speeds. Example:
591
592 Suppose an investor purchases an equal value of shares in each of
593 three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
594 What is the average P/E ratio for the investor's portfolio?
595
596 >>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio.
597 3.6
598
599 Using the arithmetic mean would give an average of about 5.167, which
600 is too high.
601
602 If ``data`` is empty, or any element is less than zero,
603 ``harmonic_mean`` will raise ``StatisticsError``.
604 """
605 # For a justification for using harmonic mean for P/E ratios, see
606 # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
607 # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
608 if iter(data) is data:
609 data = list(data)
610 errmsg = 'harmonic mean does not support negative values'
611 n = len(data)
612 if n < 1:
613 raise StatisticsError('harmonic_mean requires at least one data point')
614 elif n == 1:
615 x = data[0]
616 if isinstance(x, (numbers.Real, Decimal)):
617 if x < 0:
618 raise StatisticsError(errmsg)
619 return x
620 else:
621 raise TypeError('unsupported type')
622 try:
623 T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
624 except ZeroDivisionError:
625 return 0
626 assert count == n
627 return _convert(n/total, T)
628
629
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700630# FIXME: investigate ways to calculate medians without sorting? Quickselect?
631def median(data):
632 """Return the median (middle value) of numeric data.
633
634 When the number of data points is odd, return the middle data point.
635 When the number of data points is even, the median is interpolated by
636 taking the average of the two middle values:
637
638 >>> median([1, 3, 5])
639 3
640 >>> median([1, 3, 5, 7])
641 4.0
642
643 """
644 data = sorted(data)
645 n = len(data)
646 if n == 0:
647 raise StatisticsError("no median for empty data")
648 if n%2 == 1:
649 return data[n//2]
650 else:
651 i = n//2
652 return (data[i - 1] + data[i])/2
653
654
655def median_low(data):
656 """Return the low median of numeric data.
657
658 When the number of data points is odd, the middle value is returned.
659 When it is even, the smaller of the two middle values is returned.
660
661 >>> median_low([1, 3, 5])
662 3
663 >>> median_low([1, 3, 5, 7])
664 3
665
666 """
667 data = sorted(data)
668 n = len(data)
669 if n == 0:
670 raise StatisticsError("no median for empty data")
671 if n%2 == 1:
672 return data[n//2]
673 else:
674 return data[n//2 - 1]
675
676
677def median_high(data):
678 """Return the high median of data.
679
680 When the number of data points is odd, the middle value is returned.
681 When it is even, the larger of the two middle values is returned.
682
683 >>> median_high([1, 3, 5])
684 3
685 >>> median_high([1, 3, 5, 7])
686 5
687
688 """
689 data = sorted(data)
690 n = len(data)
691 if n == 0:
692 raise StatisticsError("no median for empty data")
693 return data[n//2]
694
695
696def median_grouped(data, interval=1):
Zachary Waredf2660e2015-10-27 22:00:41 -0500697 """Return the 50th percentile (median) of grouped continuous data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700698
699 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
700 3.7
701 >>> median_grouped([52, 52, 53, 54])
702 52.5
703
704 This calculates the median as the 50th percentile, and should be
705 used when your data is continuous and grouped. In the above example,
706 the values 1, 2, 3, etc. actually represent the midpoint of classes
707 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
708 class 3.5-4.5, and interpolation is used to estimate it.
709
710 Optional argument ``interval`` represents the class interval, and
711 defaults to 1. Changing the class interval naturally will change the
712 interpolated 50th percentile value:
713
714 >>> median_grouped([1, 3, 3, 5, 7], interval=1)
715 3.25
716 >>> median_grouped([1, 3, 3, 5, 7], interval=2)
717 3.5
718
719 This function does not check whether the data points are at least
720 ``interval`` apart.
721 """
722 data = sorted(data)
723 n = len(data)
724 if n == 0:
725 raise StatisticsError("no median for empty data")
726 elif n == 1:
727 return data[0]
728 # Find the value at the midpoint. Remember this corresponds to the
729 # centre of the class interval.
730 x = data[n//2]
731 for obj in (x, interval):
732 if isinstance(obj, (str, bytes)):
733 raise TypeError('expected number but got %r' % obj)
734 try:
735 L = x - interval/2 # The lower limit of the median interval.
736 except TypeError:
737 # Mixed type. For now we just coerce to float.
738 L = float(x) - float(interval)/2
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000739
740 # Uses bisection search to search for x in data with log(n) time complexity
Martin Panterf1579822016-05-26 06:03:33 +0000741 # Find the position of leftmost occurrence of x in data
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000742 l1 = _find_lteq(data, x)
Martin Panterf1579822016-05-26 06:03:33 +0000743 # Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000744 # Assuming always l1 <= l2
745 l2 = _find_rteq(data, l1, x)
746 cf = l1
747 f = l2 - l1 + 1
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700748 return L + interval*(n/2 - cf)/f
749
750
751def mode(data):
752 """Return the most common data point from discrete or nominal data.
753
754 ``mode`` assumes discrete data, and returns a single value. This is the
755 standard treatment of the mode as commonly taught in schools:
756
757 >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
758 3
759
760 This also works with nominal (non-numeric) data:
761
762 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
763 'red'
764
765 If there is not exactly one most common value, ``mode`` will raise
766 StatisticsError.
767 """
768 # Generate a table of sorted (value, frequency) pairs.
769 table = _counts(data)
770 if len(table) == 1:
771 return table[0][0]
772 elif table:
773 raise StatisticsError(
774 'no unique mode; found %d equally common values' % len(table)
775 )
776 else:
777 raise StatisticsError('no mode for empty data')
778
779
780# === Measures of spread ===
781
782# See http://mathworld.wolfram.com/Variance.html
783# http://mathworld.wolfram.com/SampleVariance.html
784# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
785#
786# Under no circumstances use the so-called "computational formula for
787# variance", as that is only suitable for hand calculations with a small
788# amount of low-precision data. It has terrible numeric properties.
789#
790# See a comparison of three computational methods here:
791# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
792
793def _ss(data, c=None):
794 """Return sum of square deviations of sequence data.
795
796 If ``c`` is None, the mean is calculated in one pass, and the deviations
797 from the mean are calculated in a second pass. Otherwise, deviations are
798 calculated from ``c`` as given. Use the second case with care, as it can
799 lead to garbage results.
800 """
801 if c is None:
802 c = mean(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100803 T, total, count = _sum((x-c)**2 for x in data)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700804 # The following sum should mathematically equal zero, but due to rounding
805 # error may not.
Steven D'Apranob28c3272015-12-01 19:59:53 +1100806 U, total2, count2 = _sum((x-c) for x in data)
807 assert T == U and count == count2
808 total -= total2**2/len(data)
809 assert not total < 0, 'negative sum of square deviations: %f' % total
810 return (T, total)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700811
812
813def variance(data, xbar=None):
814 """Return the sample variance of data.
815
816 data should be an iterable of Real-valued numbers, with at least two
817 values. The optional argument xbar, if given, should be the mean of
818 the data. If it is missing or None, the mean is automatically calculated.
819
820 Use this function when your data is a sample from a population. To
821 calculate the variance from the entire population, see ``pvariance``.
822
823 Examples:
824
825 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
826 >>> variance(data)
827 1.3720238095238095
828
829 If you have already calculated the mean of your data, you can pass it as
830 the optional second argument ``xbar`` to avoid recalculating it:
831
832 >>> m = mean(data)
833 >>> variance(data, m)
834 1.3720238095238095
835
836 This function does not check that ``xbar`` is actually the mean of
837 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
838 impossible results.
839
840 Decimals and Fractions are supported:
841
842 >>> from decimal import Decimal as D
843 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
844 Decimal('31.01875')
845
846 >>> from fractions import Fraction as F
847 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
848 Fraction(67, 108)
849
850 """
851 if iter(data) is data:
852 data = list(data)
853 n = len(data)
854 if n < 2:
855 raise StatisticsError('variance requires at least two data points')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100856 T, ss = _ss(data, xbar)
857 return _convert(ss/(n-1), T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700858
859
860def pvariance(data, mu=None):
861 """Return the population variance of ``data``.
862
863 data should be an iterable of Real-valued numbers, with at least one
864 value. The optional argument mu, if given, should be the mean of
865 the data. If it is missing or None, the mean is automatically calculated.
866
867 Use this function to calculate the variance from the entire population.
868 To estimate the variance from a sample, the ``variance`` function is
869 usually a better choice.
870
871 Examples:
872
873 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
874 >>> pvariance(data)
875 1.25
876
877 If you have already calculated the mean of the data, you can pass it as
878 the optional second argument to avoid recalculating it:
879
880 >>> mu = mean(data)
881 >>> pvariance(data, mu)
882 1.25
883
884 This function does not check that ``mu`` is actually the mean of ``data``.
885 Giving arbitrary values for ``mu`` may lead to invalid or impossible
886 results.
887
888 Decimals and Fractions are supported:
889
890 >>> from decimal import Decimal as D
891 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
892 Decimal('24.815')
893
894 >>> from fractions import Fraction as F
895 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
896 Fraction(13, 72)
897
898 """
899 if iter(data) is data:
900 data = list(data)
901 n = len(data)
902 if n < 1:
903 raise StatisticsError('pvariance requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100904 T, ss = _ss(data, mu)
905 return _convert(ss/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700906
907
908def stdev(data, xbar=None):
909 """Return the square root of the sample variance.
910
911 See ``variance`` for arguments and other details.
912
913 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
914 1.0810874155219827
915
916 """
917 var = variance(data, xbar)
918 try:
919 return var.sqrt()
920 except AttributeError:
921 return math.sqrt(var)
922
923
924def pstdev(data, mu=None):
925 """Return the square root of the population variance.
926
927 See ``pvariance`` for arguments and other details.
928
929 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
930 0.986893273527251
931
932 """
933 var = pvariance(data, mu)
934 try:
935 return var.sqrt()
936 except AttributeError:
937 return math.sqrt(var)