blob: 518f54654469edf53b71aa723d9c5e61e9ee687e [file] [log] [blame]
Larry Hastingsf5e987b2013-10-19 11:50:09 -07001## Module statistics.py
2##
3## Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>.
4##
5## Licensed under the Apache License, Version 2.0 (the "License");
6## you may not use this file except in compliance with the License.
7## You may obtain a copy of the License at
8##
9## http://www.apache.org/licenses/LICENSE-2.0
10##
11## Unless required by applicable law or agreed to in writing, software
12## distributed under the License is distributed on an "AS IS" BASIS,
13## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14## See the License for the specific language governing permissions and
15## limitations under the License.
16
17
18"""
19Basic statistics module.
20
21This module provides functions for calculating statistics of data, including
22averages, variance, and standard deviation.
23
24Calculating averages
25--------------------
26
27================== =============================================
28Function Description
29================== =============================================
30mean Arithmetic mean (average) of data.
31median Median (middle value) of data.
32median_low Low median of data.
33median_high High median of data.
34median_grouped Median, or 50th percentile, of grouped data.
35mode Mode (most common value) of data.
36================== =============================================
37
38Calculate the arithmetic mean ("the average") of data:
39
40>>> mean([-1.0, 2.5, 3.25, 5.75])
412.625
42
43
44Calculate the standard median of discrete data:
45
46>>> median([2, 3, 4, 5])
473.5
48
49
50Calculate the median, or 50th percentile, of data grouped into class intervals
51centred on the data values provided. E.g. if your data points are rounded to
52the nearest whole number:
53
54>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
552.8333333333...
56
57This should be interpreted in this way: you have two data points in the class
58interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
59the class interval 3.5-4.5. The median of these data points is 2.8333...
60
61
62Calculating variability or spread
63---------------------------------
64
65================== =============================================
66Function Description
67================== =============================================
68pvariance Population variance of data.
69variance Sample variance of data.
70pstdev Population standard deviation of data.
71stdev Sample standard deviation of data.
72================== =============================================
73
74Calculate the standard deviation of sample data:
75
76>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
774.38961843444...
78
79If you have previously calculated the mean, you can pass it as the optional
80second argument to the four "spread" functions to avoid recalculating it:
81
82>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
83>>> mu = mean(data)
84>>> pvariance(data, mu)
852.5
86
87
88Exceptions
89----------
90
91A single exception is defined: StatisticsError is a subclass of ValueError.
92
93"""
94
95__all__ = [ 'StatisticsError',
96 'pstdev', 'pvariance', 'stdev', 'variance',
97 'median', 'median_low', 'median_high', 'median_grouped',
98 'mean', 'mode',
99 ]
100
101
102import collections
103import math
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700104
105from fractions import Fraction
106from decimal import Decimal
Steven D'Apranob28c3272015-12-01 19:59:53 +1100107from itertools import groupby
108
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700109
110
111# === Exceptions ===
112
113class StatisticsError(ValueError):
114 pass
115
116
117# === Private utilities ===
118
119def _sum(data, start=0):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100120 """_sum(data [, start]) -> (type, sum, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700121
Steven D'Apranob28c3272015-12-01 19:59:53 +1100122 Return a high-precision sum of the given numeric data as a fraction,
123 together with the type to be converted to and the count of items.
124
125 If optional argument ``start`` is given, it is added to the total.
126 If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700127
128
129 Examples
130 --------
131
132 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100133 (<class 'float'>, Fraction(11, 1), 5)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700134
135 Some sources of round-off error will be avoided:
136
137 >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
Steven D'Apranob28c3272015-12-01 19:59:53 +1100138 (<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700139
140 Fractions and Decimals are also supported:
141
142 >>> from fractions import Fraction as F
143 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Steven D'Apranob28c3272015-12-01 19:59:53 +1100144 (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700145
146 >>> from decimal import Decimal as D
147 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
148 >>> _sum(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100149 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700150
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000151 Mixed types are currently treated as an error, except that int is
152 allowed.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700153 """
Steven D'Apranob28c3272015-12-01 19:59:53 +1100154 count = 0
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700155 n, d = _exact_ratio(start)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100156 partials = {d: n}
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700157 partials_get = partials.get
Steven D'Apranob28c3272015-12-01 19:59:53 +1100158 T = _coerce(int, type(start))
159 for typ, values in groupby(data, type):
160 T = _coerce(T, typ) # or raise TypeError
161 for n,d in map(_exact_ratio, values):
162 count += 1
163 partials[d] = partials_get(d, 0) + n
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700164 if None in partials:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100165 # The sum will be a NAN or INF. We can ignore all the finite
166 # partials, and just look at this special one.
167 total = partials[None]
168 assert not _isfinite(total)
169 else:
170 # Sum all the partial sums using builtin sum.
171 # FIXME is this faster if we sum them in order of the denominator?
172 total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
173 return (T, total, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700174
175
Steven D'Apranob28c3272015-12-01 19:59:53 +1100176def _isfinite(x):
177 try:
178 return x.is_finite() # Likely a Decimal.
179 except AttributeError:
180 return math.isfinite(x) # Coerces to float first.
181
182
183def _coerce(T, S):
184 """Coerce types T and S to a common type, or raise TypeError.
185
186 Coercion rules are currently an implementation detail. See the CoerceTest
187 test class in test_statistics for details.
188 """
189 # See http://bugs.python.org/issue24068.
190 assert T is not bool, "initial type T is bool"
191 # If the types are the same, no need to coerce anything. Put this
192 # first, so that the usual case (no coercion needed) happens as soon
193 # as possible.
194 if T is S: return T
195 # Mixed int & other coerce to the other type.
196 if S is int or S is bool: return T
197 if T is int: return S
198 # If one is a (strict) subclass of the other, coerce to the subclass.
199 if issubclass(S, T): return S
200 if issubclass(T, S): return T
201 # Ints coerce to the other type.
202 if issubclass(T, int): return S
203 if issubclass(S, int): return T
204 # Mixed fraction & float coerces to float (or float subclass).
205 if issubclass(T, Fraction) and issubclass(S, float):
206 return S
207 if issubclass(T, float) and issubclass(S, Fraction):
208 return T
209 # Any other combination is disallowed.
210 msg = "don't know how to coerce %s and %s"
211 raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000212
213
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700214def _exact_ratio(x):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100215 """Return Real number x to exact (numerator, denominator) pair.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700216
217 >>> _exact_ratio(0.25)
218 (1, 4)
219
220 x is expected to be an int, Fraction, Decimal or float.
221 """
222 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100223 # Optimise the common case of floats. We expect that the most often
224 # used numeric type will be builtin floats, so try to make this as
225 # fast as possible.
226 if type(x) is float:
227 return x.as_integer_ratio()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700228 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100229 # x may be an int, Fraction, or Integral ABC.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700230 return (x.numerator, x.denominator)
231 except AttributeError:
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700232 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100233 # x may be a float subclass.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700234 return x.as_integer_ratio()
235 except AttributeError:
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700236 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100237 # x may be a Decimal.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700238 return _decimal_to_ratio(x)
239 except AttributeError:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100240 # Just give up?
241 pass
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700242 except (OverflowError, ValueError):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100243 # float NAN or INF.
244 assert not math.isfinite(x)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700245 return (x, None)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100246 msg = "can't convert type '{}' to numerator/denominator"
247 raise TypeError(msg.format(type(x).__name__))
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700248
249
250# FIXME This is faster than Fraction.from_decimal, but still too slow.
251def _decimal_to_ratio(d):
252 """Convert Decimal d to exact integer ratio (numerator, denominator).
253
254 >>> from decimal import Decimal
255 >>> _decimal_to_ratio(Decimal("2.6"))
256 (26, 10)
257
258 """
259 sign, digits, exp = d.as_tuple()
260 if exp in ('F', 'n', 'N'): # INF, NAN, sNAN
261 assert not d.is_finite()
Steven D'Apranob28c3272015-12-01 19:59:53 +1100262 return (d, None)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700263 num = 0
264 for digit in digits:
265 num = num*10 + digit
Nick Coghlan4a7668a2014-02-08 23:55:14 +1000266 if exp < 0:
267 den = 10**-exp
268 else:
269 num *= 10**exp
270 den = 1
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700271 if sign:
272 num = -num
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700273 return (num, den)
274
275
Steven D'Apranob28c3272015-12-01 19:59:53 +1100276def _convert(value, T):
277 """Convert value to given numeric type T."""
278 if type(value) is T:
279 # This covers the cases where T is Fraction, or where value is
280 # a NAN or INF (Decimal or float).
281 return value
282 if issubclass(T, int) and value.denominator != 1:
283 T = float
284 try:
285 # FIXME: what do we do if this overflows?
286 return T(value)
287 except TypeError:
288 if issubclass(T, Decimal):
289 return T(value.numerator)/T(value.denominator)
290 else:
291 raise
292
293
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700294def _counts(data):
295 # Generate a table of sorted (value, frequency) pairs.
Nick Coghlanbfd68bf2014-02-08 19:44:16 +1000296 table = collections.Counter(iter(data)).most_common()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700297 if not table:
298 return table
299 # Extract the values with the highest frequency.
300 maxfreq = table[0][1]
301 for i in range(1, len(table)):
302 if table[i][1] != maxfreq:
303 table = table[:i]
304 break
305 return table
306
307
308# === Measures of central tendency (averages) ===
309
310def mean(data):
311 """Return the sample arithmetic mean of data.
312
313 >>> mean([1, 2, 3, 4, 4])
314 2.8
315
316 >>> from fractions import Fraction as F
317 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
318 Fraction(13, 21)
319
320 >>> from decimal import Decimal as D
321 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
322 Decimal('0.5625')
323
324 If ``data`` is empty, StatisticsError will be raised.
325 """
326 if iter(data) is data:
327 data = list(data)
328 n = len(data)
329 if n < 1:
330 raise StatisticsError('mean requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100331 T, total, count = _sum(data)
332 assert count == n
333 return _convert(total/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700334
335
336# FIXME: investigate ways to calculate medians without sorting? Quickselect?
337def median(data):
338 """Return the median (middle value) of numeric data.
339
340 When the number of data points is odd, return the middle data point.
341 When the number of data points is even, the median is interpolated by
342 taking the average of the two middle values:
343
344 >>> median([1, 3, 5])
345 3
346 >>> median([1, 3, 5, 7])
347 4.0
348
349 """
350 data = sorted(data)
351 n = len(data)
352 if n == 0:
353 raise StatisticsError("no median for empty data")
354 if n%2 == 1:
355 return data[n//2]
356 else:
357 i = n//2
358 return (data[i - 1] + data[i])/2
359
360
361def median_low(data):
362 """Return the low median of numeric data.
363
364 When the number of data points is odd, the middle value is returned.
365 When it is even, the smaller of the two middle values is returned.
366
367 >>> median_low([1, 3, 5])
368 3
369 >>> median_low([1, 3, 5, 7])
370 3
371
372 """
373 data = sorted(data)
374 n = len(data)
375 if n == 0:
376 raise StatisticsError("no median for empty data")
377 if n%2 == 1:
378 return data[n//2]
379 else:
380 return data[n//2 - 1]
381
382
383def median_high(data):
384 """Return the high median of data.
385
386 When the number of data points is odd, the middle value is returned.
387 When it is even, the larger of the two middle values is returned.
388
389 >>> median_high([1, 3, 5])
390 3
391 >>> median_high([1, 3, 5, 7])
392 5
393
394 """
395 data = sorted(data)
396 n = len(data)
397 if n == 0:
398 raise StatisticsError("no median for empty data")
399 return data[n//2]
400
401
402def median_grouped(data, interval=1):
Zachary Waredf2660e2015-10-27 22:00:41 -0500403 """Return the 50th percentile (median) of grouped continuous data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700404
405 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
406 3.7
407 >>> median_grouped([52, 52, 53, 54])
408 52.5
409
410 This calculates the median as the 50th percentile, and should be
411 used when your data is continuous and grouped. In the above example,
412 the values 1, 2, 3, etc. actually represent the midpoint of classes
413 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
414 class 3.5-4.5, and interpolation is used to estimate it.
415
416 Optional argument ``interval`` represents the class interval, and
417 defaults to 1. Changing the class interval naturally will change the
418 interpolated 50th percentile value:
419
420 >>> median_grouped([1, 3, 3, 5, 7], interval=1)
421 3.25
422 >>> median_grouped([1, 3, 3, 5, 7], interval=2)
423 3.5
424
425 This function does not check whether the data points are at least
426 ``interval`` apart.
427 """
428 data = sorted(data)
429 n = len(data)
430 if n == 0:
431 raise StatisticsError("no median for empty data")
432 elif n == 1:
433 return data[0]
434 # Find the value at the midpoint. Remember this corresponds to the
435 # centre of the class interval.
436 x = data[n//2]
437 for obj in (x, interval):
438 if isinstance(obj, (str, bytes)):
439 raise TypeError('expected number but got %r' % obj)
440 try:
441 L = x - interval/2 # The lower limit of the median interval.
442 except TypeError:
443 # Mixed type. For now we just coerce to float.
444 L = float(x) - float(interval)/2
445 cf = data.index(x) # Number of values below the median interval.
446 # FIXME The following line could be more efficient for big lists.
447 f = data.count(x) # Number of data points in the median interval.
448 return L + interval*(n/2 - cf)/f
449
450
451def mode(data):
452 """Return the most common data point from discrete or nominal data.
453
454 ``mode`` assumes discrete data, and returns a single value. This is the
455 standard treatment of the mode as commonly taught in schools:
456
457 >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
458 3
459
460 This also works with nominal (non-numeric) data:
461
462 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
463 'red'
464
465 If there is not exactly one most common value, ``mode`` will raise
466 StatisticsError.
467 """
468 # Generate a table of sorted (value, frequency) pairs.
469 table = _counts(data)
470 if len(table) == 1:
471 return table[0][0]
472 elif table:
473 raise StatisticsError(
474 'no unique mode; found %d equally common values' % len(table)
475 )
476 else:
477 raise StatisticsError('no mode for empty data')
478
479
480# === Measures of spread ===
481
482# See http://mathworld.wolfram.com/Variance.html
483# http://mathworld.wolfram.com/SampleVariance.html
484# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
485#
486# Under no circumstances use the so-called "computational formula for
487# variance", as that is only suitable for hand calculations with a small
488# amount of low-precision data. It has terrible numeric properties.
489#
490# See a comparison of three computational methods here:
491# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
492
493def _ss(data, c=None):
494 """Return sum of square deviations of sequence data.
495
496 If ``c`` is None, the mean is calculated in one pass, and the deviations
497 from the mean are calculated in a second pass. Otherwise, deviations are
498 calculated from ``c`` as given. Use the second case with care, as it can
499 lead to garbage results.
500 """
501 if c is None:
502 c = mean(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100503 T, total, count = _sum((x-c)**2 for x in data)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700504 # The following sum should mathematically equal zero, but due to rounding
505 # error may not.
Steven D'Apranob28c3272015-12-01 19:59:53 +1100506 U, total2, count2 = _sum((x-c) for x in data)
507 assert T == U and count == count2
508 total -= total2**2/len(data)
509 assert not total < 0, 'negative sum of square deviations: %f' % total
510 return (T, total)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700511
512
513def variance(data, xbar=None):
514 """Return the sample variance of data.
515
516 data should be an iterable of Real-valued numbers, with at least two
517 values. The optional argument xbar, if given, should be the mean of
518 the data. If it is missing or None, the mean is automatically calculated.
519
520 Use this function when your data is a sample from a population. To
521 calculate the variance from the entire population, see ``pvariance``.
522
523 Examples:
524
525 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
526 >>> variance(data)
527 1.3720238095238095
528
529 If you have already calculated the mean of your data, you can pass it as
530 the optional second argument ``xbar`` to avoid recalculating it:
531
532 >>> m = mean(data)
533 >>> variance(data, m)
534 1.3720238095238095
535
536 This function does not check that ``xbar`` is actually the mean of
537 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
538 impossible results.
539
540 Decimals and Fractions are supported:
541
542 >>> from decimal import Decimal as D
543 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
544 Decimal('31.01875')
545
546 >>> from fractions import Fraction as F
547 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
548 Fraction(67, 108)
549
550 """
551 if iter(data) is data:
552 data = list(data)
553 n = len(data)
554 if n < 2:
555 raise StatisticsError('variance requires at least two data points')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100556 T, ss = _ss(data, xbar)
557 return _convert(ss/(n-1), T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700558
559
560def pvariance(data, mu=None):
561 """Return the population variance of ``data``.
562
563 data should be an iterable of Real-valued numbers, with at least one
564 value. The optional argument mu, if given, should be the mean of
565 the data. If it is missing or None, the mean is automatically calculated.
566
567 Use this function to calculate the variance from the entire population.
568 To estimate the variance from a sample, the ``variance`` function is
569 usually a better choice.
570
571 Examples:
572
573 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
574 >>> pvariance(data)
575 1.25
576
577 If you have already calculated the mean of the data, you can pass it as
578 the optional second argument to avoid recalculating it:
579
580 >>> mu = mean(data)
581 >>> pvariance(data, mu)
582 1.25
583
584 This function does not check that ``mu`` is actually the mean of ``data``.
585 Giving arbitrary values for ``mu`` may lead to invalid or impossible
586 results.
587
588 Decimals and Fractions are supported:
589
590 >>> from decimal import Decimal as D
591 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
592 Decimal('24.815')
593
594 >>> from fractions import Fraction as F
595 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
596 Fraction(13, 72)
597
598 """
599 if iter(data) is data:
600 data = list(data)
601 n = len(data)
602 if n < 1:
603 raise StatisticsError('pvariance requires at least one data point')
604 ss = _ss(data, mu)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100605 T, ss = _ss(data, mu)
606 return _convert(ss/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700607
608
609def stdev(data, xbar=None):
610 """Return the square root of the sample variance.
611
612 See ``variance`` for arguments and other details.
613
614 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
615 1.0810874155219827
616
617 """
618 var = variance(data, xbar)
619 try:
620 return var.sqrt()
621 except AttributeError:
622 return math.sqrt(var)
623
624
625def pstdev(data, mu=None):
626 """Return the square root of the population variance.
627
628 See ``pvariance`` for arguments and other details.
629
630 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
631 0.986893273527251
632
633 """
634 var = pvariance(data, mu)
635 try:
636 return var.sqrt()
637 except AttributeError:
638 return math.sqrt(var)