blob: af5d41e89e7bdb64f2e3159c4213dc1291d48cb6 [file] [log] [blame]
Larry Hastingsf5e987b2013-10-19 11:50:09 -07001## Module statistics.py
2##
3## Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>.
4##
5## Licensed under the Apache License, Version 2.0 (the "License");
6## you may not use this file except in compliance with the License.
7## You may obtain a copy of the License at
8##
9## http://www.apache.org/licenses/LICENSE-2.0
10##
11## Unless required by applicable law or agreed to in writing, software
12## distributed under the License is distributed on an "AS IS" BASIS,
13## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14## See the License for the specific language governing permissions and
15## limitations under the License.
16
17
18"""
19Basic statistics module.
20
21This module provides functions for calculating statistics of data, including
22averages, variance, and standard deviation.
23
24Calculating averages
25--------------------
26
27================== =============================================
28Function Description
29================== =============================================
30mean Arithmetic mean (average) of data.
31median Median (middle value) of data.
32median_low Low median of data.
33median_high High median of data.
34median_grouped Median, or 50th percentile, of grouped data.
35mode Mode (most common value) of data.
36================== =============================================
37
38Calculate the arithmetic mean ("the average") of data:
39
40>>> mean([-1.0, 2.5, 3.25, 5.75])
412.625
42
43
44Calculate the standard median of discrete data:
45
46>>> median([2, 3, 4, 5])
473.5
48
49
50Calculate the median, or 50th percentile, of data grouped into class intervals
51centred on the data values provided. E.g. if your data points are rounded to
52the nearest whole number:
53
54>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
552.8333333333...
56
57This should be interpreted in this way: you have two data points in the class
58interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
59the class interval 3.5-4.5. The median of these data points is 2.8333...
60
61
62Calculating variability or spread
63---------------------------------
64
65================== =============================================
66Function Description
67================== =============================================
68pvariance Population variance of data.
69variance Sample variance of data.
70pstdev Population standard deviation of data.
71stdev Sample standard deviation of data.
72================== =============================================
73
74Calculate the standard deviation of sample data:
75
76>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
774.38961843444...
78
79If you have previously calculated the mean, you can pass it as the optional
80second argument to the four "spread" functions to avoid recalculating it:
81
82>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
83>>> mu = mean(data)
84>>> pvariance(data, mu)
852.5
86
87
88Exceptions
89----------
90
91A single exception is defined: StatisticsError is a subclass of ValueError.
92
93"""
94
95__all__ = [ 'StatisticsError',
96 'pstdev', 'pvariance', 'stdev', 'variance',
97 'median', 'median_low', 'median_high', 'median_grouped',
98 'mean', 'mode',
99 ]
100
101
102import collections
103import math
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700104
105from fractions import Fraction
106from decimal import Decimal
Steven D'Apranob28c3272015-12-01 19:59:53 +1100107from itertools import groupby
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000108from bisect import bisect_left, bisect_right
Steven D'Apranob28c3272015-12-01 19:59:53 +1100109
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700110
111
112# === Exceptions ===
113
114class StatisticsError(ValueError):
115 pass
116
117
118# === Private utilities ===
119
120def _sum(data, start=0):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100121 """_sum(data [, start]) -> (type, sum, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700122
Steven D'Apranob28c3272015-12-01 19:59:53 +1100123 Return a high-precision sum of the given numeric data as a fraction,
124 together with the type to be converted to and the count of items.
125
126 If optional argument ``start`` is given, it is added to the total.
127 If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700128
129
130 Examples
131 --------
132
133 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100134 (<class 'float'>, Fraction(11, 1), 5)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700135
136 Some sources of round-off error will be avoided:
137
138 >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
Steven D'Apranob28c3272015-12-01 19:59:53 +1100139 (<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700140
141 Fractions and Decimals are also supported:
142
143 >>> from fractions import Fraction as F
144 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Steven D'Apranob28c3272015-12-01 19:59:53 +1100145 (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700146
147 >>> from decimal import Decimal as D
148 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
149 >>> _sum(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100150 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700151
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000152 Mixed types are currently treated as an error, except that int is
153 allowed.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700154 """
Steven D'Apranob28c3272015-12-01 19:59:53 +1100155 count = 0
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700156 n, d = _exact_ratio(start)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100157 partials = {d: n}
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700158 partials_get = partials.get
Steven D'Apranob28c3272015-12-01 19:59:53 +1100159 T = _coerce(int, type(start))
160 for typ, values in groupby(data, type):
161 T = _coerce(T, typ) # or raise TypeError
162 for n,d in map(_exact_ratio, values):
163 count += 1
164 partials[d] = partials_get(d, 0) + n
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700165 if None in partials:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100166 # The sum will be a NAN or INF. We can ignore all the finite
167 # partials, and just look at this special one.
168 total = partials[None]
169 assert not _isfinite(total)
170 else:
171 # Sum all the partial sums using builtin sum.
172 # FIXME is this faster if we sum them in order of the denominator?
173 total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
174 return (T, total, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700175
176
Steven D'Apranob28c3272015-12-01 19:59:53 +1100177def _isfinite(x):
178 try:
179 return x.is_finite() # Likely a Decimal.
180 except AttributeError:
181 return math.isfinite(x) # Coerces to float first.
182
183
184def _coerce(T, S):
185 """Coerce types T and S to a common type, or raise TypeError.
186
187 Coercion rules are currently an implementation detail. See the CoerceTest
188 test class in test_statistics for details.
189 """
190 # See http://bugs.python.org/issue24068.
191 assert T is not bool, "initial type T is bool"
192 # If the types are the same, no need to coerce anything. Put this
193 # first, so that the usual case (no coercion needed) happens as soon
194 # as possible.
195 if T is S: return T
196 # Mixed int & other coerce to the other type.
197 if S is int or S is bool: return T
198 if T is int: return S
199 # If one is a (strict) subclass of the other, coerce to the subclass.
200 if issubclass(S, T): return S
201 if issubclass(T, S): return T
202 # Ints coerce to the other type.
203 if issubclass(T, int): return S
204 if issubclass(S, int): return T
205 # Mixed fraction & float coerces to float (or float subclass).
206 if issubclass(T, Fraction) and issubclass(S, float):
207 return S
208 if issubclass(T, float) and issubclass(S, Fraction):
209 return T
210 # Any other combination is disallowed.
211 msg = "don't know how to coerce %s and %s"
212 raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000213
214
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700215def _exact_ratio(x):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100216 """Return Real number x to exact (numerator, denominator) pair.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700217
218 >>> _exact_ratio(0.25)
219 (1, 4)
220
221 x is expected to be an int, Fraction, Decimal or float.
222 """
223 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100224 # Optimise the common case of floats. We expect that the most often
225 # used numeric type will be builtin floats, so try to make this as
226 # fast as possible.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000227 if type(x) is float or type(x) is Decimal:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100228 return x.as_integer_ratio()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700229 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100230 # x may be an int, Fraction, or Integral ABC.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700231 return (x.numerator, x.denominator)
232 except AttributeError:
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700233 try:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000234 # x may be a float or Decimal subclass.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700235 return x.as_integer_ratio()
236 except AttributeError:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000237 # Just give up?
238 pass
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700239 except (OverflowError, ValueError):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100240 # float NAN or INF.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000241 assert not _isfinite(x)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700242 return (x, None)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100243 msg = "can't convert type '{}' to numerator/denominator"
244 raise TypeError(msg.format(type(x).__name__))
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700245
246
Steven D'Apranob28c3272015-12-01 19:59:53 +1100247def _convert(value, T):
248 """Convert value to given numeric type T."""
249 if type(value) is T:
250 # This covers the cases where T is Fraction, or where value is
251 # a NAN or INF (Decimal or float).
252 return value
253 if issubclass(T, int) and value.denominator != 1:
254 T = float
255 try:
256 # FIXME: what do we do if this overflows?
257 return T(value)
258 except TypeError:
259 if issubclass(T, Decimal):
260 return T(value.numerator)/T(value.denominator)
261 else:
262 raise
263
264
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700265def _counts(data):
266 # Generate a table of sorted (value, frequency) pairs.
Nick Coghlanbfd68bf2014-02-08 19:44:16 +1000267 table = collections.Counter(iter(data)).most_common()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700268 if not table:
269 return table
270 # Extract the values with the highest frequency.
271 maxfreq = table[0][1]
272 for i in range(1, len(table)):
273 if table[i][1] != maxfreq:
274 table = table[:i]
275 break
276 return table
277
278
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000279def _find_lteq(a, x):
280 'Locate the leftmost value exactly equal to x'
281 i = bisect_left(a, x)
282 if i != len(a) and a[i] == x:
283 return i
284 raise ValueError
285
286
287def _find_rteq(a, l, x):
288 'Locate the rightmost value exactly equal to x'
289 i = bisect_right(a, x, lo=l)
290 if i != (len(a)+1) and a[i-1] == x:
291 return i-1
292 raise ValueError
293
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700294# === Measures of central tendency (averages) ===
295
296def mean(data):
297 """Return the sample arithmetic mean of data.
298
299 >>> mean([1, 2, 3, 4, 4])
300 2.8
301
302 >>> from fractions import Fraction as F
303 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
304 Fraction(13, 21)
305
306 >>> from decimal import Decimal as D
307 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
308 Decimal('0.5625')
309
310 If ``data`` is empty, StatisticsError will be raised.
311 """
312 if iter(data) is data:
313 data = list(data)
314 n = len(data)
315 if n < 1:
316 raise StatisticsError('mean requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100317 T, total, count = _sum(data)
318 assert count == n
319 return _convert(total/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700320
321
322# FIXME: investigate ways to calculate medians without sorting? Quickselect?
323def median(data):
324 """Return the median (middle value) of numeric data.
325
326 When the number of data points is odd, return the middle data point.
327 When the number of data points is even, the median is interpolated by
328 taking the average of the two middle values:
329
330 >>> median([1, 3, 5])
331 3
332 >>> median([1, 3, 5, 7])
333 4.0
334
335 """
336 data = sorted(data)
337 n = len(data)
338 if n == 0:
339 raise StatisticsError("no median for empty data")
340 if n%2 == 1:
341 return data[n//2]
342 else:
343 i = n//2
344 return (data[i - 1] + data[i])/2
345
346
347def median_low(data):
348 """Return the low median of numeric data.
349
350 When the number of data points is odd, the middle value is returned.
351 When it is even, the smaller of the two middle values is returned.
352
353 >>> median_low([1, 3, 5])
354 3
355 >>> median_low([1, 3, 5, 7])
356 3
357
358 """
359 data = sorted(data)
360 n = len(data)
361 if n == 0:
362 raise StatisticsError("no median for empty data")
363 if n%2 == 1:
364 return data[n//2]
365 else:
366 return data[n//2 - 1]
367
368
369def median_high(data):
370 """Return the high median of data.
371
372 When the number of data points is odd, the middle value is returned.
373 When it is even, the larger of the two middle values is returned.
374
375 >>> median_high([1, 3, 5])
376 3
377 >>> median_high([1, 3, 5, 7])
378 5
379
380 """
381 data = sorted(data)
382 n = len(data)
383 if n == 0:
384 raise StatisticsError("no median for empty data")
385 return data[n//2]
386
387
388def median_grouped(data, interval=1):
Zachary Waredf2660e2015-10-27 22:00:41 -0500389 """Return the 50th percentile (median) of grouped continuous data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700390
391 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
392 3.7
393 >>> median_grouped([52, 52, 53, 54])
394 52.5
395
396 This calculates the median as the 50th percentile, and should be
397 used when your data is continuous and grouped. In the above example,
398 the values 1, 2, 3, etc. actually represent the midpoint of classes
399 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
400 class 3.5-4.5, and interpolation is used to estimate it.
401
402 Optional argument ``interval`` represents the class interval, and
403 defaults to 1. Changing the class interval naturally will change the
404 interpolated 50th percentile value:
405
406 >>> median_grouped([1, 3, 3, 5, 7], interval=1)
407 3.25
408 >>> median_grouped([1, 3, 3, 5, 7], interval=2)
409 3.5
410
411 This function does not check whether the data points are at least
412 ``interval`` apart.
413 """
414 data = sorted(data)
415 n = len(data)
416 if n == 0:
417 raise StatisticsError("no median for empty data")
418 elif n == 1:
419 return data[0]
420 # Find the value at the midpoint. Remember this corresponds to the
421 # centre of the class interval.
422 x = data[n//2]
423 for obj in (x, interval):
424 if isinstance(obj, (str, bytes)):
425 raise TypeError('expected number but got %r' % obj)
426 try:
427 L = x - interval/2 # The lower limit of the median interval.
428 except TypeError:
429 # Mixed type. For now we just coerce to float.
430 L = float(x) - float(interval)/2
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000431
432 # Uses bisection search to search for x in data with log(n) time complexity
433 # Find the position of leftmost occurence of x in data
434 l1 = _find_lteq(data, x)
435 # Find the position of rightmost occurence of x in data[l1...len(data)]
436 # Assuming always l1 <= l2
437 l2 = _find_rteq(data, l1, x)
438 cf = l1
439 f = l2 - l1 + 1
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700440 return L + interval*(n/2 - cf)/f
441
442
443def mode(data):
444 """Return the most common data point from discrete or nominal data.
445
446 ``mode`` assumes discrete data, and returns a single value. This is the
447 standard treatment of the mode as commonly taught in schools:
448
449 >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
450 3
451
452 This also works with nominal (non-numeric) data:
453
454 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
455 'red'
456
457 If there is not exactly one most common value, ``mode`` will raise
458 StatisticsError.
459 """
460 # Generate a table of sorted (value, frequency) pairs.
461 table = _counts(data)
462 if len(table) == 1:
463 return table[0][0]
464 elif table:
465 raise StatisticsError(
466 'no unique mode; found %d equally common values' % len(table)
467 )
468 else:
469 raise StatisticsError('no mode for empty data')
470
471
472# === Measures of spread ===
473
474# See http://mathworld.wolfram.com/Variance.html
475# http://mathworld.wolfram.com/SampleVariance.html
476# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
477#
478# Under no circumstances use the so-called "computational formula for
479# variance", as that is only suitable for hand calculations with a small
480# amount of low-precision data. It has terrible numeric properties.
481#
482# See a comparison of three computational methods here:
483# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
484
485def _ss(data, c=None):
486 """Return sum of square deviations of sequence data.
487
488 If ``c`` is None, the mean is calculated in one pass, and the deviations
489 from the mean are calculated in a second pass. Otherwise, deviations are
490 calculated from ``c`` as given. Use the second case with care, as it can
491 lead to garbage results.
492 """
493 if c is None:
494 c = mean(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100495 T, total, count = _sum((x-c)**2 for x in data)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700496 # The following sum should mathematically equal zero, but due to rounding
497 # error may not.
Steven D'Apranob28c3272015-12-01 19:59:53 +1100498 U, total2, count2 = _sum((x-c) for x in data)
499 assert T == U and count == count2
500 total -= total2**2/len(data)
501 assert not total < 0, 'negative sum of square deviations: %f' % total
502 return (T, total)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700503
504
505def variance(data, xbar=None):
506 """Return the sample variance of data.
507
508 data should be an iterable of Real-valued numbers, with at least two
509 values. The optional argument xbar, if given, should be the mean of
510 the data. If it is missing or None, the mean is automatically calculated.
511
512 Use this function when your data is a sample from a population. To
513 calculate the variance from the entire population, see ``pvariance``.
514
515 Examples:
516
517 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
518 >>> variance(data)
519 1.3720238095238095
520
521 If you have already calculated the mean of your data, you can pass it as
522 the optional second argument ``xbar`` to avoid recalculating it:
523
524 >>> m = mean(data)
525 >>> variance(data, m)
526 1.3720238095238095
527
528 This function does not check that ``xbar`` is actually the mean of
529 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
530 impossible results.
531
532 Decimals and Fractions are supported:
533
534 >>> from decimal import Decimal as D
535 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
536 Decimal('31.01875')
537
538 >>> from fractions import Fraction as F
539 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
540 Fraction(67, 108)
541
542 """
543 if iter(data) is data:
544 data = list(data)
545 n = len(data)
546 if n < 2:
547 raise StatisticsError('variance requires at least two data points')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100548 T, ss = _ss(data, xbar)
549 return _convert(ss/(n-1), T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700550
551
552def pvariance(data, mu=None):
553 """Return the population variance of ``data``.
554
555 data should be an iterable of Real-valued numbers, with at least one
556 value. The optional argument mu, if given, should be the mean of
557 the data. If it is missing or None, the mean is automatically calculated.
558
559 Use this function to calculate the variance from the entire population.
560 To estimate the variance from a sample, the ``variance`` function is
561 usually a better choice.
562
563 Examples:
564
565 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
566 >>> pvariance(data)
567 1.25
568
569 If you have already calculated the mean of the data, you can pass it as
570 the optional second argument to avoid recalculating it:
571
572 >>> mu = mean(data)
573 >>> pvariance(data, mu)
574 1.25
575
576 This function does not check that ``mu`` is actually the mean of ``data``.
577 Giving arbitrary values for ``mu`` may lead to invalid or impossible
578 results.
579
580 Decimals and Fractions are supported:
581
582 >>> from decimal import Decimal as D
583 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
584 Decimal('24.815')
585
586 >>> from fractions import Fraction as F
587 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
588 Fraction(13, 72)
589
590 """
591 if iter(data) is data:
592 data = list(data)
593 n = len(data)
594 if n < 1:
595 raise StatisticsError('pvariance requires at least one data point')
596 ss = _ss(data, mu)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100597 T, ss = _ss(data, mu)
598 return _convert(ss/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700599
600
601def stdev(data, xbar=None):
602 """Return the square root of the sample variance.
603
604 See ``variance`` for arguments and other details.
605
606 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
607 1.0810874155219827
608
609 """
610 var = variance(data, xbar)
611 try:
612 return var.sqrt()
613 except AttributeError:
614 return math.sqrt(var)
615
616
617def pstdev(data, mu=None):
618 """Return the square root of the population variance.
619
620 See ``pvariance`` for arguments and other details.
621
622 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
623 0.986893273527251
624
625 """
626 var = pvariance(data, mu)
627 try:
628 return var.sqrt()
629 except AttributeError:
630 return math.sqrt(var)