blob: f4b49b5d0cafac996f1a2af483cd3fd6874646bd [file] [log] [blame]
Larry Hastingsf5e987b2013-10-19 11:50:09 -07001## Module statistics.py
2##
3## Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>.
4##
5## Licensed under the Apache License, Version 2.0 (the "License");
6## you may not use this file except in compliance with the License.
7## You may obtain a copy of the License at
8##
9## http://www.apache.org/licenses/LICENSE-2.0
10##
11## Unless required by applicable law or agreed to in writing, software
12## distributed under the License is distributed on an "AS IS" BASIS,
13## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14## See the License for the specific language governing permissions and
15## limitations under the License.
16
17
18"""
19Basic statistics module.
20
21This module provides functions for calculating statistics of data, including
22averages, variance, and standard deviation.
23
24Calculating averages
25--------------------
26
27================== =============================================
28Function Description
29================== =============================================
30mean Arithmetic mean (average) of data.
Steven D'Apranoa474afd2016-08-09 12:49:01 +100031harmonic_mean Harmonic mean of data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -070032median Median (middle value) of data.
33median_low Low median of data.
34median_high High median of data.
35median_grouped Median, or 50th percentile, of grouped data.
36mode Mode (most common value) of data.
37================== =============================================
38
39Calculate the arithmetic mean ("the average") of data:
40
41>>> mean([-1.0, 2.5, 3.25, 5.75])
422.625
43
44
45Calculate the standard median of discrete data:
46
47>>> median([2, 3, 4, 5])
483.5
49
50
51Calculate the median, or 50th percentile, of data grouped into class intervals
52centred on the data values provided. E.g. if your data points are rounded to
53the nearest whole number:
54
55>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
562.8333333333...
57
58This should be interpreted in this way: you have two data points in the class
59interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
60the class interval 3.5-4.5. The median of these data points is 2.8333...
61
62
63Calculating variability or spread
64---------------------------------
65
66================== =============================================
67Function Description
68================== =============================================
69pvariance Population variance of data.
70variance Sample variance of data.
71pstdev Population standard deviation of data.
72stdev Sample standard deviation of data.
73================== =============================================
74
75Calculate the standard deviation of sample data:
76
77>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
784.38961843444...
79
80If you have previously calculated the mean, you can pass it as the optional
81second argument to the four "spread" functions to avoid recalculating it:
82
83>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
84>>> mu = mean(data)
85>>> pvariance(data, mu)
862.5
87
88
89Exceptions
90----------
91
92A single exception is defined: StatisticsError is a subclass of ValueError.
93
94"""
95
96__all__ = [ 'StatisticsError',
97 'pstdev', 'pvariance', 'stdev', 'variance',
98 'median', 'median_low', 'median_high', 'median_grouped',
Steven D'Apranoa474afd2016-08-09 12:49:01 +100099 'mean', 'mode', 'harmonic_mean',
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700100 ]
101
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700102import collections
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000103import decimal
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700104import math
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000105import numbers
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700106
107from fractions import Fraction
108from decimal import Decimal
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000109from itertools import groupby, chain
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000110from bisect import bisect_left, bisect_right
Steven D'Apranob28c3272015-12-01 19:59:53 +1100111
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700112
113
114# === Exceptions ===
115
116class StatisticsError(ValueError):
117 pass
118
119
120# === Private utilities ===
121
122def _sum(data, start=0):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100123 """_sum(data [, start]) -> (type, sum, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700124
Steven D'Apranob28c3272015-12-01 19:59:53 +1100125 Return a high-precision sum of the given numeric data as a fraction,
126 together with the type to be converted to and the count of items.
127
128 If optional argument ``start`` is given, it is added to the total.
129 If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700130
131
132 Examples
133 --------
134
135 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700136 (<class 'float'>, Fraction(11, 1), 5)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700137
138 Some sources of round-off error will be avoided:
139
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000140 # Built-in sum returns zero.
141 >>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700142 (<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700143
144 Fractions and Decimals are also supported:
145
146 >>> from fractions import Fraction as F
147 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Petersonab078e92016-07-13 21:13:29 -0700148 (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700149
150 >>> from decimal import Decimal as D
151 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
152 >>> _sum(data)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700153 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700154
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000155 Mixed types are currently treated as an error, except that int is
156 allowed.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700157 """
Steven D'Apranob28c3272015-12-01 19:59:53 +1100158 count = 0
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700159 n, d = _exact_ratio(start)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100160 partials = {d: n}
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700161 partials_get = partials.get
Steven D'Apranob28c3272015-12-01 19:59:53 +1100162 T = _coerce(int, type(start))
163 for typ, values in groupby(data, type):
164 T = _coerce(T, typ) # or raise TypeError
165 for n,d in map(_exact_ratio, values):
166 count += 1
167 partials[d] = partials_get(d, 0) + n
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700168 if None in partials:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100169 # The sum will be a NAN or INF. We can ignore all the finite
170 # partials, and just look at this special one.
171 total = partials[None]
172 assert not _isfinite(total)
173 else:
174 # Sum all the partial sums using builtin sum.
175 # FIXME is this faster if we sum them in order of the denominator?
176 total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
177 return (T, total, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700178
179
Steven D'Apranob28c3272015-12-01 19:59:53 +1100180def _isfinite(x):
181 try:
182 return x.is_finite() # Likely a Decimal.
183 except AttributeError:
184 return math.isfinite(x) # Coerces to float first.
185
186
187def _coerce(T, S):
188 """Coerce types T and S to a common type, or raise TypeError.
189
190 Coercion rules are currently an implementation detail. See the CoerceTest
191 test class in test_statistics for details.
192 """
193 # See http://bugs.python.org/issue24068.
194 assert T is not bool, "initial type T is bool"
195 # If the types are the same, no need to coerce anything. Put this
196 # first, so that the usual case (no coercion needed) happens as soon
197 # as possible.
198 if T is S: return T
199 # Mixed int & other coerce to the other type.
200 if S is int or S is bool: return T
201 if T is int: return S
202 # If one is a (strict) subclass of the other, coerce to the subclass.
203 if issubclass(S, T): return S
204 if issubclass(T, S): return T
205 # Ints coerce to the other type.
206 if issubclass(T, int): return S
207 if issubclass(S, int): return T
208 # Mixed fraction & float coerces to float (or float subclass).
209 if issubclass(T, Fraction) and issubclass(S, float):
210 return S
211 if issubclass(T, float) and issubclass(S, Fraction):
212 return T
213 # Any other combination is disallowed.
214 msg = "don't know how to coerce %s and %s"
215 raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000216
217
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700218def _exact_ratio(x):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100219 """Return Real number x to exact (numerator, denominator) pair.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700220
221 >>> _exact_ratio(0.25)
222 (1, 4)
223
224 x is expected to be an int, Fraction, Decimal or float.
225 """
226 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100227 # Optimise the common case of floats. We expect that the most often
228 # used numeric type will be builtin floats, so try to make this as
229 # fast as possible.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000230 if type(x) is float or type(x) is Decimal:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100231 return x.as_integer_ratio()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700232 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100233 # x may be an int, Fraction, or Integral ABC.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700234 return (x.numerator, x.denominator)
235 except AttributeError:
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700236 try:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000237 # x may be a float or Decimal subclass.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700238 return x.as_integer_ratio()
239 except AttributeError:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000240 # Just give up?
241 pass
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700242 except (OverflowError, ValueError):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100243 # float NAN or INF.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000244 assert not _isfinite(x)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700245 return (x, None)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100246 msg = "can't convert type '{}' to numerator/denominator"
247 raise TypeError(msg.format(type(x).__name__))
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700248
249
Steven D'Apranob28c3272015-12-01 19:59:53 +1100250def _convert(value, T):
251 """Convert value to given numeric type T."""
252 if type(value) is T:
253 # This covers the cases where T is Fraction, or where value is
254 # a NAN or INF (Decimal or float).
255 return value
256 if issubclass(T, int) and value.denominator != 1:
257 T = float
258 try:
259 # FIXME: what do we do if this overflows?
260 return T(value)
261 except TypeError:
262 if issubclass(T, Decimal):
263 return T(value.numerator)/T(value.denominator)
264 else:
265 raise
266
267
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700268def _counts(data):
269 # Generate a table of sorted (value, frequency) pairs.
Nick Coghlanbfd68bf2014-02-08 19:44:16 +1000270 table = collections.Counter(iter(data)).most_common()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700271 if not table:
272 return table
273 # Extract the values with the highest frequency.
274 maxfreq = table[0][1]
275 for i in range(1, len(table)):
276 if table[i][1] != maxfreq:
277 table = table[:i]
278 break
279 return table
280
281
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000282def _find_lteq(a, x):
283 'Locate the leftmost value exactly equal to x'
284 i = bisect_left(a, x)
285 if i != len(a) and a[i] == x:
286 return i
287 raise ValueError
288
289
290def _find_rteq(a, l, x):
291 'Locate the rightmost value exactly equal to x'
292 i = bisect_right(a, x, lo=l)
293 if i != (len(a)+1) and a[i-1] == x:
294 return i-1
295 raise ValueError
296
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000297
298def _fail_neg(values, errmsg='negative value'):
299 """Iterate over values, failing if any are less than zero."""
300 for x in values:
301 if x < 0:
302 raise StatisticsError(errmsg)
303 yield x
304
305
Steven D'Aprano9a2be912016-08-09 13:58:10 +1000306class _nroot_NS:
307 """Hands off! Don't touch!
308
309 Everything inside this namespace (class) is an even-more-private
310 implementation detail of the private _nth_root function.
311 """
312 # This class exists only to be used as a namespace, for convenience
313 # of being able to keep the related functions together, and to
314 # collapse the group in an editor. If this were C# or C++, I would
315 # use a Namespace, but the closest Python has is a class.
316 #
317 # FIXME possibly move this out into a separate module?
318 # That feels like overkill, and may encourage people to treat it as
319 # a public feature.
320 def __init__(self):
321 raise TypeError('namespace only, do not instantiate')
322
323 def nth_root(x, n):
324 """Return the positive nth root of numeric x.
325
326 This may be more accurate than ** or pow():
327
328 >>> math.pow(1000, 1.0/3) #doctest:+SKIP
329 9.999999999999998
330
331 >>> _nth_root(1000, 3)
332 10.0
333 >>> _nth_root(11**5, 5)
334 11.0
335 >>> _nth_root(2, 12)
336 1.0594630943592953
337
338 """
339 if not isinstance(n, int):
340 raise TypeError('degree n must be an int')
341 if n < 2:
342 raise ValueError('degree n must be 2 or more')
343 if isinstance(x, decimal.Decimal):
344 return _nroot_NS.decimal_nroot(x, n)
345 elif isinstance(x, numbers.Real):
346 return _nroot_NS.float_nroot(x, n)
347 else:
348 raise TypeError('expected a number, got %s') % type(x).__name__
349
350 def float_nroot(x, n):
351 """Handle nth root of Reals, treated as a float."""
352 assert isinstance(n, int) and n > 1
353 if x < 0:
354 if n%2 == 0:
355 raise ValueError('domain error: even root of negative number')
356 else:
357 return -_nroot_NS.nroot(-x, n)
358 elif x == 0:
359 return math.copysign(0.0, x)
360 elif x > 0:
361 try:
362 isinfinity = math.isinf(x)
363 except OverflowError:
364 return _nroot_NS.bignum_nroot(x, n)
365 else:
366 if isinfinity:
367 return float('inf')
368 else:
369 return _nroot_NS.nroot(x, n)
370 else:
371 assert math.isnan(x)
372 return float('nan')
373
374 def nroot(x, n):
375 """Calculate x**(1/n), then improve the answer."""
376 # This uses math.pow() to calculate an initial guess for the root,
377 # then uses the iterated nroot algorithm to improve it.
378 #
379 # By my testing, about 8% of the time the iterated algorithm ends
380 # up converging to a result which is less accurate than the initial
381 # guess. [FIXME: is this still true?] In that case, we use the
382 # guess instead of the "improved" value. This way, we're never
383 # less accurate than math.pow().
384 r1 = math.pow(x, 1.0/n)
385 eps1 = abs(r1**n - x)
386 if eps1 == 0.0:
387 # r1 is the exact root, so we're done. By my testing, this
388 # occurs about 80% of the time for x < 1 and 30% of the
389 # time for x > 1.
390 return r1
391 else:
392 try:
393 r2 = _nroot_NS.iterated_nroot(x, n, r1)
394 except RuntimeError:
395 return r1
396 else:
397 eps2 = abs(r2**n - x)
398 if eps1 < eps2:
399 return r1
400 return r2
401
402 def iterated_nroot(a, n, g):
403 """Return the nth root of a, starting with guess g.
404
405 This is a special case of Newton's Method.
406 https://en.wikipedia.org/wiki/Nth_root_algorithm
407 """
408 np = n - 1
409 def iterate(r):
410 try:
411 return (np*r + a/math.pow(r, np))/n
412 except OverflowError:
413 # If r is large enough, r**np may overflow. If that
414 # happens, r**-np will be small, but not necessarily zero.
415 return (np*r + a*math.pow(r, -np))/n
416 # With a good guess, such as g = a**(1/n), this will converge in
417 # only a few iterations. However a poor guess can take thousands
418 # of iterations to converge, if at all. We guard against poor
419 # guesses by setting an upper limit to the number of iterations.
420 r1 = g
421 r2 = iterate(g)
422 for i in range(1000):
423 if r1 == r2:
424 break
425 # Use Floyd's cycle-finding algorithm to avoid being trapped
426 # in a cycle.
427 # https://en.wikipedia.org/wiki/Cycle_detection#Tortoise_and_hare
428 r1 = iterate(r1)
429 r2 = iterate(iterate(r2))
430 else:
431 # If the guess is particularly bad, the above may fail to
432 # converge in any reasonable time.
433 raise RuntimeError('nth-root failed to converge')
434 return r2
435
436 def decimal_nroot(x, n):
437 """Handle nth root of Decimals."""
438 assert isinstance(x, decimal.Decimal)
439 assert isinstance(n, int)
440 if x.is_snan():
441 # Signalling NANs always raise.
442 raise decimal.InvalidOperation('nth-root of snan')
443 if x.is_qnan():
444 # Quiet NANs only raise if the context is set to raise,
445 # otherwise return a NAN.
446 ctx = decimal.getcontext()
447 if ctx.traps[decimal.InvalidOperation]:
448 raise decimal.InvalidOperation('nth-root of nan')
449 else:
450 # Preserve the input NAN.
451 return x
452 if x.is_infinite():
453 return x
454 # FIXME this hasn't had the extensive testing of the float
455 # version _iterated_nroot so there's possibly some buggy
456 # corner cases buried in here. Can it overflow? Fail to
457 # converge or get trapped in a cycle? Converge to a less
458 # accurate root?
459 np = n - 1
460 def iterate(r):
461 return (np*r + x/r**np)/n
462 r0 = x**(decimal.Decimal(1)/n)
463 assert isinstance(r0, decimal.Decimal)
464 r1 = iterate(r0)
465 while True:
466 if r1 == r0:
467 return r1
468 r0, r1 = r1, iterate(r1)
469
470 def bignum_nroot(x, n):
471 """Return the nth root of a positive huge number."""
472 assert x > 0
473 # I state without proof that ⁿ√x ≈ ⁿ√2·ⁿ√(x//2)
474 # and that for sufficiently big x the error is acceptible.
475 # We now halve x until it is small enough to get the root.
476 m = 0
477 while True:
478 x //= 2
479 m += 1
480 try:
481 y = float(x)
482 except OverflowError:
483 continue
484 break
485 a = _nroot_NS.nroot(y, n)
486 # At this point, we want the nth-root of 2**m, or 2**(m/n).
487 # We can write that as 2**(q + r/n) = 2**q * ⁿ√2**r where q = m//n.
488 q, r = divmod(m, n)
489 b = 2**q * _nroot_NS.nroot(2**r, n)
490 return a * b
491
492
493# This is the (private) function for calculating nth roots:
494_nth_root = _nroot_NS.nth_root
495assert type(_nth_root) is type(lambda: None)
496
497
498def _product(values):
499 """Return product of values as (exponent, mantissa)."""
500 errmsg = 'mixed Decimal and float is not supported'
501 prod = 1
502 for x in values:
503 if isinstance(x, float):
504 break
505 prod *= x
506 else:
507 return (0, prod)
508 if isinstance(prod, Decimal):
509 raise TypeError(errmsg)
510 # Since floats can overflow easily, we calculate the product as a
511 # sort of poor-man's BigFloat. Given that:
512 #
513 # x = 2**p * m # p == power or exponent (scale), m = mantissa
514 #
515 # we can calculate the product of two (or more) x values as:
516 #
517 # x1*x2 = 2**p1*m1 * 2**p2*m2 = 2**(p1+p2)*(m1*m2)
518 #
519 mant, scale = 1, 0 #math.frexp(prod) # FIXME
520 for y in chain([x], values):
521 if isinstance(y, Decimal):
522 raise TypeError(errmsg)
523 m1, e1 = math.frexp(y)
524 m2, e2 = math.frexp(mant)
525 scale += (e1 + e2)
526 mant = m1*m2
527 return (scale, mant)
528
529
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700530# === Measures of central tendency (averages) ===
531
532def mean(data):
533 """Return the sample arithmetic mean of data.
534
535 >>> mean([1, 2, 3, 4, 4])
536 2.8
537
538 >>> from fractions import Fraction as F
539 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
540 Fraction(13, 21)
541
542 >>> from decimal import Decimal as D
543 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
544 Decimal('0.5625')
545
546 If ``data`` is empty, StatisticsError will be raised.
547 """
548 if iter(data) is data:
549 data = list(data)
550 n = len(data)
551 if n < 1:
552 raise StatisticsError('mean requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100553 T, total, count = _sum(data)
554 assert count == n
555 return _convert(total/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700556
557
Steven D'Aprano9a2be912016-08-09 13:58:10 +1000558def geometric_mean(data):
559 """Return the geometric mean of data.
560
561 The geometric mean is appropriate when averaging quantities which
562 are multiplied together rather than added, for example growth rates.
563 Suppose an investment grows by 10% in the first year, falls by 5% in
564 the second, then grows by 12% in the third, what is the average rate
565 of growth over the three years?
566
567 >>> geometric_mean([1.10, 0.95, 1.12])
568 1.0538483123382172
569
570 giving an average growth of 5.385%. Using the arithmetic mean will
571 give approximately 5.667%, which is too high.
572
573 ``StatisticsError`` will be raised if ``data`` is empty, or any
574 element is less than zero.
575 """
576 if iter(data) is data:
577 data = list(data)
578 errmsg = 'geometric mean does not support negative values'
579 n = len(data)
580 if n < 1:
581 raise StatisticsError('geometric_mean requires at least one data point')
582 elif n == 1:
583 x = data[0]
584 if isinstance(g, (numbers.Real, Decimal)):
585 if x < 0:
586 raise StatisticsError(errmsg)
587 return x
588 else:
589 raise TypeError('unsupported type')
590 else:
591 scale, prod = _product(_fail_neg(data, errmsg))
592 r = _nth_root(prod, n)
593 if scale:
594 p, q = divmod(scale, n)
595 s = 2**p * _nth_root(2**q, n)
596 else:
597 s = 1
598 return s*r
599
600
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000601def harmonic_mean(data):
602 """Return the harmonic mean of data.
603
604 The harmonic mean, sometimes called the subcontrary mean, is the
605 reciprocal of the arithmetic mean of the reciprocals of the data,
606 and is often appropriate when averaging quantities which are rates
607 or ratios, for example speeds. Example:
608
609 Suppose an investor purchases an equal value of shares in each of
610 three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
611 What is the average P/E ratio for the investor's portfolio?
612
613 >>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio.
614 3.6
615
616 Using the arithmetic mean would give an average of about 5.167, which
617 is too high.
618
619 If ``data`` is empty, or any element is less than zero,
620 ``harmonic_mean`` will raise ``StatisticsError``.
621 """
622 # For a justification for using harmonic mean for P/E ratios, see
623 # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
624 # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
625 if iter(data) is data:
626 data = list(data)
627 errmsg = 'harmonic mean does not support negative values'
628 n = len(data)
629 if n < 1:
630 raise StatisticsError('harmonic_mean requires at least one data point')
631 elif n == 1:
632 x = data[0]
633 if isinstance(x, (numbers.Real, Decimal)):
634 if x < 0:
635 raise StatisticsError(errmsg)
636 return x
637 else:
638 raise TypeError('unsupported type')
639 try:
640 T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
641 except ZeroDivisionError:
642 return 0
643 assert count == n
644 return _convert(n/total, T)
645
646
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700647# FIXME: investigate ways to calculate medians without sorting? Quickselect?
648def median(data):
649 """Return the median (middle value) of numeric data.
650
651 When the number of data points is odd, return the middle data point.
652 When the number of data points is even, the median is interpolated by
653 taking the average of the two middle values:
654
655 >>> median([1, 3, 5])
656 3
657 >>> median([1, 3, 5, 7])
658 4.0
659
660 """
661 data = sorted(data)
662 n = len(data)
663 if n == 0:
664 raise StatisticsError("no median for empty data")
665 if n%2 == 1:
666 return data[n//2]
667 else:
668 i = n//2
669 return (data[i - 1] + data[i])/2
670
671
672def median_low(data):
673 """Return the low median of numeric data.
674
675 When the number of data points is odd, the middle value is returned.
676 When it is even, the smaller of the two middle values is returned.
677
678 >>> median_low([1, 3, 5])
679 3
680 >>> median_low([1, 3, 5, 7])
681 3
682
683 """
684 data = sorted(data)
685 n = len(data)
686 if n == 0:
687 raise StatisticsError("no median for empty data")
688 if n%2 == 1:
689 return data[n//2]
690 else:
691 return data[n//2 - 1]
692
693
694def median_high(data):
695 """Return the high median of data.
696
697 When the number of data points is odd, the middle value is returned.
698 When it is even, the larger of the two middle values is returned.
699
700 >>> median_high([1, 3, 5])
701 3
702 >>> median_high([1, 3, 5, 7])
703 5
704
705 """
706 data = sorted(data)
707 n = len(data)
708 if n == 0:
709 raise StatisticsError("no median for empty data")
710 return data[n//2]
711
712
713def median_grouped(data, interval=1):
Zachary Waredf2660e2015-10-27 22:00:41 -0500714 """Return the 50th percentile (median) of grouped continuous data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700715
716 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
717 3.7
718 >>> median_grouped([52, 52, 53, 54])
719 52.5
720
721 This calculates the median as the 50th percentile, and should be
722 used when your data is continuous and grouped. In the above example,
723 the values 1, 2, 3, etc. actually represent the midpoint of classes
724 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
725 class 3.5-4.5, and interpolation is used to estimate it.
726
727 Optional argument ``interval`` represents the class interval, and
728 defaults to 1. Changing the class interval naturally will change the
729 interpolated 50th percentile value:
730
731 >>> median_grouped([1, 3, 3, 5, 7], interval=1)
732 3.25
733 >>> median_grouped([1, 3, 3, 5, 7], interval=2)
734 3.5
735
736 This function does not check whether the data points are at least
737 ``interval`` apart.
738 """
739 data = sorted(data)
740 n = len(data)
741 if n == 0:
742 raise StatisticsError("no median for empty data")
743 elif n == 1:
744 return data[0]
745 # Find the value at the midpoint. Remember this corresponds to the
746 # centre of the class interval.
747 x = data[n//2]
748 for obj in (x, interval):
749 if isinstance(obj, (str, bytes)):
750 raise TypeError('expected number but got %r' % obj)
751 try:
752 L = x - interval/2 # The lower limit of the median interval.
753 except TypeError:
754 # Mixed type. For now we just coerce to float.
755 L = float(x) - float(interval)/2
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000756
757 # Uses bisection search to search for x in data with log(n) time complexity
Martin Panterf1579822016-05-26 06:03:33 +0000758 # Find the position of leftmost occurrence of x in data
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000759 l1 = _find_lteq(data, x)
Martin Panterf1579822016-05-26 06:03:33 +0000760 # Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000761 # Assuming always l1 <= l2
762 l2 = _find_rteq(data, l1, x)
763 cf = l1
764 f = l2 - l1 + 1
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700765 return L + interval*(n/2 - cf)/f
766
767
768def mode(data):
769 """Return the most common data point from discrete or nominal data.
770
771 ``mode`` assumes discrete data, and returns a single value. This is the
772 standard treatment of the mode as commonly taught in schools:
773
774 >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
775 3
776
777 This also works with nominal (non-numeric) data:
778
779 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
780 'red'
781
782 If there is not exactly one most common value, ``mode`` will raise
783 StatisticsError.
784 """
785 # Generate a table of sorted (value, frequency) pairs.
786 table = _counts(data)
787 if len(table) == 1:
788 return table[0][0]
789 elif table:
790 raise StatisticsError(
791 'no unique mode; found %d equally common values' % len(table)
792 )
793 else:
794 raise StatisticsError('no mode for empty data')
795
796
797# === Measures of spread ===
798
799# See http://mathworld.wolfram.com/Variance.html
800# http://mathworld.wolfram.com/SampleVariance.html
801# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
802#
803# Under no circumstances use the so-called "computational formula for
804# variance", as that is only suitable for hand calculations with a small
805# amount of low-precision data. It has terrible numeric properties.
806#
807# See a comparison of three computational methods here:
808# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
809
810def _ss(data, c=None):
811 """Return sum of square deviations of sequence data.
812
813 If ``c`` is None, the mean is calculated in one pass, and the deviations
814 from the mean are calculated in a second pass. Otherwise, deviations are
815 calculated from ``c`` as given. Use the second case with care, as it can
816 lead to garbage results.
817 """
818 if c is None:
819 c = mean(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100820 T, total, count = _sum((x-c)**2 for x in data)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700821 # The following sum should mathematically equal zero, but due to rounding
822 # error may not.
Steven D'Apranob28c3272015-12-01 19:59:53 +1100823 U, total2, count2 = _sum((x-c) for x in data)
824 assert T == U and count == count2
825 total -= total2**2/len(data)
826 assert not total < 0, 'negative sum of square deviations: %f' % total
827 return (T, total)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700828
829
830def variance(data, xbar=None):
831 """Return the sample variance of data.
832
833 data should be an iterable of Real-valued numbers, with at least two
834 values. The optional argument xbar, if given, should be the mean of
835 the data. If it is missing or None, the mean is automatically calculated.
836
837 Use this function when your data is a sample from a population. To
838 calculate the variance from the entire population, see ``pvariance``.
839
840 Examples:
841
842 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
843 >>> variance(data)
844 1.3720238095238095
845
846 If you have already calculated the mean of your data, you can pass it as
847 the optional second argument ``xbar`` to avoid recalculating it:
848
849 >>> m = mean(data)
850 >>> variance(data, m)
851 1.3720238095238095
852
853 This function does not check that ``xbar`` is actually the mean of
854 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
855 impossible results.
856
857 Decimals and Fractions are supported:
858
859 >>> from decimal import Decimal as D
860 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
861 Decimal('31.01875')
862
863 >>> from fractions import Fraction as F
864 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
865 Fraction(67, 108)
866
867 """
868 if iter(data) is data:
869 data = list(data)
870 n = len(data)
871 if n < 2:
872 raise StatisticsError('variance requires at least two data points')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100873 T, ss = _ss(data, xbar)
874 return _convert(ss/(n-1), T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700875
876
877def pvariance(data, mu=None):
878 """Return the population variance of ``data``.
879
880 data should be an iterable of Real-valued numbers, with at least one
881 value. The optional argument mu, if given, should be the mean of
882 the data. If it is missing or None, the mean is automatically calculated.
883
884 Use this function to calculate the variance from the entire population.
885 To estimate the variance from a sample, the ``variance`` function is
886 usually a better choice.
887
888 Examples:
889
890 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
891 >>> pvariance(data)
892 1.25
893
894 If you have already calculated the mean of the data, you can pass it as
895 the optional second argument to avoid recalculating it:
896
897 >>> mu = mean(data)
898 >>> pvariance(data, mu)
899 1.25
900
901 This function does not check that ``mu`` is actually the mean of ``data``.
902 Giving arbitrary values for ``mu`` may lead to invalid or impossible
903 results.
904
905 Decimals and Fractions are supported:
906
907 >>> from decimal import Decimal as D
908 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
909 Decimal('24.815')
910
911 >>> from fractions import Fraction as F
912 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
913 Fraction(13, 72)
914
915 """
916 if iter(data) is data:
917 data = list(data)
918 n = len(data)
919 if n < 1:
920 raise StatisticsError('pvariance requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100921 T, ss = _ss(data, mu)
922 return _convert(ss/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700923
924
925def stdev(data, xbar=None):
926 """Return the square root of the sample variance.
927
928 See ``variance`` for arguments and other details.
929
930 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
931 1.0810874155219827
932
933 """
934 var = variance(data, xbar)
935 try:
936 return var.sqrt()
937 except AttributeError:
938 return math.sqrt(var)
939
940
941def pstdev(data, mu=None):
942 """Return the square root of the population variance.
943
944 See ``pvariance`` for arguments and other details.
945
946 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
947 0.986893273527251
948
949 """
950 var = pvariance(data, mu)
951 try:
952 return var.sqrt()
953 except AttributeError:
954 return math.sqrt(var)