blob: a18241ba0434933c8118fc28a4b138720bad36e2 [file] [log] [blame]
Larry Hastingsf5e987b2013-10-19 11:50:09 -07001## Module statistics.py
2##
3## Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>.
4##
5## Licensed under the Apache License, Version 2.0 (the "License");
6## you may not use this file except in compliance with the License.
7## You may obtain a copy of the License at
8##
9## http://www.apache.org/licenses/LICENSE-2.0
10##
11## Unless required by applicable law or agreed to in writing, software
12## distributed under the License is distributed on an "AS IS" BASIS,
13## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14## See the License for the specific language governing permissions and
15## limitations under the License.
16
17
18"""
19Basic statistics module.
20
21This module provides functions for calculating statistics of data, including
22averages, variance, and standard deviation.
23
24Calculating averages
25--------------------
26
27================== =============================================
28Function Description
29================== =============================================
30mean Arithmetic mean (average) of data.
31median Median (middle value) of data.
32median_low Low median of data.
33median_high High median of data.
34median_grouped Median, or 50th percentile, of grouped data.
35mode Mode (most common value) of data.
36================== =============================================
37
38Calculate the arithmetic mean ("the average") of data:
39
40>>> mean([-1.0, 2.5, 3.25, 5.75])
412.625
42
43
44Calculate the standard median of discrete data:
45
46>>> median([2, 3, 4, 5])
473.5
48
49
50Calculate the median, or 50th percentile, of data grouped into class intervals
51centred on the data values provided. E.g. if your data points are rounded to
52the nearest whole number:
53
54>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
552.8333333333...
56
57This should be interpreted in this way: you have two data points in the class
58interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
59the class interval 3.5-4.5. The median of these data points is 2.8333...
60
61
62Calculating variability or spread
63---------------------------------
64
65================== =============================================
66Function Description
67================== =============================================
68pvariance Population variance of data.
69variance Sample variance of data.
70pstdev Population standard deviation of data.
71stdev Sample standard deviation of data.
72================== =============================================
73
74Calculate the standard deviation of sample data:
75
76>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
774.38961843444...
78
79If you have previously calculated the mean, you can pass it as the optional
80second argument to the four "spread" functions to avoid recalculating it:
81
82>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
83>>> mu = mean(data)
84>>> pvariance(data, mu)
852.5
86
87
88Exceptions
89----------
90
91A single exception is defined: StatisticsError is a subclass of ValueError.
92
93"""
94
95__all__ = [ 'StatisticsError',
96 'pstdev', 'pvariance', 'stdev', 'variance',
97 'median', 'median_low', 'median_high', 'median_grouped',
98 'mean', 'mode',
99 ]
100
101
102import collections
103import math
104import numbers
105import operator
106
107from fractions import Fraction
108from decimal import Decimal
109
110
111# === Exceptions ===
112
113class StatisticsError(ValueError):
114 pass
115
116
117# === Private utilities ===
118
119def _sum(data, start=0):
120 """_sum(data [, start]) -> value
121
122 Return a high-precision sum of the given numeric data. If optional
123 argument ``start`` is given, it is added to the total. If ``data`` is
124 empty, ``start`` (defaulting to 0) is returned.
125
126
127 Examples
128 --------
129
130 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
131 11.0
132
133 Some sources of round-off error will be avoided:
134
135 >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
136 1000.0
137
138 Fractions and Decimals are also supported:
139
140 >>> from fractions import Fraction as F
141 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
142 Fraction(63, 20)
143
144 >>> from decimal import Decimal as D
145 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
146 >>> _sum(data)
147 Decimal('0.6963')
148
149 """
150 n, d = _exact_ratio(start)
151 T = type(start)
152 partials = {d: n} # map {denominator: sum of numerators}
153 # Micro-optimizations.
154 coerce_types = _coerce_types
155 exact_ratio = _exact_ratio
156 partials_get = partials.get
157 # Add numerators for each denominator, and track the "current" type.
158 for x in data:
159 T = _coerce_types(T, type(x))
160 n, d = exact_ratio(x)
161 partials[d] = partials_get(d, 0) + n
162 if None in partials:
163 assert issubclass(T, (float, Decimal))
164 assert not math.isfinite(partials[None])
165 return T(partials[None])
166 total = Fraction()
167 for d, n in sorted(partials.items()):
168 total += Fraction(n, d)
169 if issubclass(T, int):
170 assert total.denominator == 1
171 return T(total.numerator)
172 if issubclass(T, Decimal):
173 return T(total.numerator)/total.denominator
174 return T(total)
175
176
177def _exact_ratio(x):
178 """Convert Real number x exactly to (numerator, denominator) pair.
179
180 >>> _exact_ratio(0.25)
181 (1, 4)
182
183 x is expected to be an int, Fraction, Decimal or float.
184 """
185 try:
186 try:
187 # int, Fraction
188 return (x.numerator, x.denominator)
189 except AttributeError:
190 # float
191 try:
192 return x.as_integer_ratio()
193 except AttributeError:
194 # Decimal
195 try:
196 return _decimal_to_ratio(x)
197 except AttributeError:
198 msg = "can't convert type '{}' to numerator/denominator"
199 raise TypeError(msg.format(type(x).__name__)) from None
200 except (OverflowError, ValueError):
201 # INF or NAN
202 if __debug__:
203 # Decimal signalling NANs cannot be converted to float :-(
204 if isinstance(x, Decimal):
205 assert not x.is_finite()
206 else:
207 assert not math.isfinite(x)
208 return (x, None)
209
210
211# FIXME This is faster than Fraction.from_decimal, but still too slow.
212def _decimal_to_ratio(d):
213 """Convert Decimal d to exact integer ratio (numerator, denominator).
214
215 >>> from decimal import Decimal
216 >>> _decimal_to_ratio(Decimal("2.6"))
217 (26, 10)
218
219 """
220 sign, digits, exp = d.as_tuple()
221 if exp in ('F', 'n', 'N'): # INF, NAN, sNAN
222 assert not d.is_finite()
223 raise ValueError
224 num = 0
225 for digit in digits:
226 num = num*10 + digit
227 if sign:
228 num = -num
229 den = 10**-exp
230 return (num, den)
231
232
233def _coerce_types(T1, T2):
234 """Coerce types T1 and T2 to a common type.
235
236 >>> _coerce_types(int, float)
237 <class 'float'>
238
239 Coercion is performed according to this table, where "N/A" means
240 that a TypeError exception is raised.
241
242 +----------+-----------+-----------+-----------+----------+
243 | | int | Fraction | Decimal | float |
244 +----------+-----------+-----------+-----------+----------+
245 | int | int | Fraction | Decimal | float |
246 | Fraction | Fraction | Fraction | N/A | float |
247 | Decimal | Decimal | N/A | Decimal | float |
248 | float | float | float | float | float |
249 +----------+-----------+-----------+-----------+----------+
250
251 Subclasses trump their parent class; two subclasses of the same
252 base class will be coerced to the second of the two.
253
254 """
255 # Get the common/fast cases out of the way first.
256 if T1 is T2: return T1
257 if T1 is int: return T2
258 if T2 is int: return T1
259 # Subclasses trump their parent class.
260 if issubclass(T2, T1): return T2
261 if issubclass(T1, T2): return T1
262 # Floats trump everything else.
263 if issubclass(T2, float): return T2
264 if issubclass(T1, float): return T1
265 # Subclasses of the same base class give priority to the second.
266 if T1.__base__ is T2.__base__: return T2
267 # Otherwise, just give up.
268 raise TypeError('cannot coerce types %r and %r' % (T1, T2))
269
270
271def _counts(data):
272 # Generate a table of sorted (value, frequency) pairs.
273 if data is None:
274 raise TypeError('None is not iterable')
275 table = collections.Counter(data).most_common()
276 if not table:
277 return table
278 # Extract the values with the highest frequency.
279 maxfreq = table[0][1]
280 for i in range(1, len(table)):
281 if table[i][1] != maxfreq:
282 table = table[:i]
283 break
284 return table
285
286
287# === Measures of central tendency (averages) ===
288
289def mean(data):
290 """Return the sample arithmetic mean of data.
291
292 >>> mean([1, 2, 3, 4, 4])
293 2.8
294
295 >>> from fractions import Fraction as F
296 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
297 Fraction(13, 21)
298
299 >>> from decimal import Decimal as D
300 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
301 Decimal('0.5625')
302
303 If ``data`` is empty, StatisticsError will be raised.
304 """
305 if iter(data) is data:
306 data = list(data)
307 n = len(data)
308 if n < 1:
309 raise StatisticsError('mean requires at least one data point')
310 return _sum(data)/n
311
312
313# FIXME: investigate ways to calculate medians without sorting? Quickselect?
314def median(data):
315 """Return the median (middle value) of numeric data.
316
317 When the number of data points is odd, return the middle data point.
318 When the number of data points is even, the median is interpolated by
319 taking the average of the two middle values:
320
321 >>> median([1, 3, 5])
322 3
323 >>> median([1, 3, 5, 7])
324 4.0
325
326 """
327 data = sorted(data)
328 n = len(data)
329 if n == 0:
330 raise StatisticsError("no median for empty data")
331 if n%2 == 1:
332 return data[n//2]
333 else:
334 i = n//2
335 return (data[i - 1] + data[i])/2
336
337
338def median_low(data):
339 """Return the low median of numeric data.
340
341 When the number of data points is odd, the middle value is returned.
342 When it is even, the smaller of the two middle values is returned.
343
344 >>> median_low([1, 3, 5])
345 3
346 >>> median_low([1, 3, 5, 7])
347 3
348
349 """
350 data = sorted(data)
351 n = len(data)
352 if n == 0:
353 raise StatisticsError("no median for empty data")
354 if n%2 == 1:
355 return data[n//2]
356 else:
357 return data[n//2 - 1]
358
359
360def median_high(data):
361 """Return the high median of data.
362
363 When the number of data points is odd, the middle value is returned.
364 When it is even, the larger of the two middle values is returned.
365
366 >>> median_high([1, 3, 5])
367 3
368 >>> median_high([1, 3, 5, 7])
369 5
370
371 """
372 data = sorted(data)
373 n = len(data)
374 if n == 0:
375 raise StatisticsError("no median for empty data")
376 return data[n//2]
377
378
379def median_grouped(data, interval=1):
380 """"Return the 50th percentile (median) of grouped continuous data.
381
382 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
383 3.7
384 >>> median_grouped([52, 52, 53, 54])
385 52.5
386
387 This calculates the median as the 50th percentile, and should be
388 used when your data is continuous and grouped. In the above example,
389 the values 1, 2, 3, etc. actually represent the midpoint of classes
390 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
391 class 3.5-4.5, and interpolation is used to estimate it.
392
393 Optional argument ``interval`` represents the class interval, and
394 defaults to 1. Changing the class interval naturally will change the
395 interpolated 50th percentile value:
396
397 >>> median_grouped([1, 3, 3, 5, 7], interval=1)
398 3.25
399 >>> median_grouped([1, 3, 3, 5, 7], interval=2)
400 3.5
401
402 This function does not check whether the data points are at least
403 ``interval`` apart.
404 """
405 data = sorted(data)
406 n = len(data)
407 if n == 0:
408 raise StatisticsError("no median for empty data")
409 elif n == 1:
410 return data[0]
411 # Find the value at the midpoint. Remember this corresponds to the
412 # centre of the class interval.
413 x = data[n//2]
414 for obj in (x, interval):
415 if isinstance(obj, (str, bytes)):
416 raise TypeError('expected number but got %r' % obj)
417 try:
418 L = x - interval/2 # The lower limit of the median interval.
419 except TypeError:
420 # Mixed type. For now we just coerce to float.
421 L = float(x) - float(interval)/2
422 cf = data.index(x) # Number of values below the median interval.
423 # FIXME The following line could be more efficient for big lists.
424 f = data.count(x) # Number of data points in the median interval.
425 return L + interval*(n/2 - cf)/f
426
427
428def mode(data):
429 """Return the most common data point from discrete or nominal data.
430
431 ``mode`` assumes discrete data, and returns a single value. This is the
432 standard treatment of the mode as commonly taught in schools:
433
434 >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
435 3
436
437 This also works with nominal (non-numeric) data:
438
439 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
440 'red'
441
442 If there is not exactly one most common value, ``mode`` will raise
443 StatisticsError.
444 """
445 # Generate a table of sorted (value, frequency) pairs.
446 table = _counts(data)
447 if len(table) == 1:
448 return table[0][0]
449 elif table:
450 raise StatisticsError(
451 'no unique mode; found %d equally common values' % len(table)
452 )
453 else:
454 raise StatisticsError('no mode for empty data')
455
456
457# === Measures of spread ===
458
459# See http://mathworld.wolfram.com/Variance.html
460# http://mathworld.wolfram.com/SampleVariance.html
461# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
462#
463# Under no circumstances use the so-called "computational formula for
464# variance", as that is only suitable for hand calculations with a small
465# amount of low-precision data. It has terrible numeric properties.
466#
467# See a comparison of three computational methods here:
468# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
469
470def _ss(data, c=None):
471 """Return sum of square deviations of sequence data.
472
473 If ``c`` is None, the mean is calculated in one pass, and the deviations
474 from the mean are calculated in a second pass. Otherwise, deviations are
475 calculated from ``c`` as given. Use the second case with care, as it can
476 lead to garbage results.
477 """
478 if c is None:
479 c = mean(data)
480 ss = _sum((x-c)**2 for x in data)
481 # The following sum should mathematically equal zero, but due to rounding
482 # error may not.
483 ss -= _sum((x-c) for x in data)**2/len(data)
484 assert not ss < 0, 'negative sum of square deviations: %f' % ss
485 return ss
486
487
488def variance(data, xbar=None):
489 """Return the sample variance of data.
490
491 data should be an iterable of Real-valued numbers, with at least two
492 values. The optional argument xbar, if given, should be the mean of
493 the data. If it is missing or None, the mean is automatically calculated.
494
495 Use this function when your data is a sample from a population. To
496 calculate the variance from the entire population, see ``pvariance``.
497
498 Examples:
499
500 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
501 >>> variance(data)
502 1.3720238095238095
503
504 If you have already calculated the mean of your data, you can pass it as
505 the optional second argument ``xbar`` to avoid recalculating it:
506
507 >>> m = mean(data)
508 >>> variance(data, m)
509 1.3720238095238095
510
511 This function does not check that ``xbar`` is actually the mean of
512 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
513 impossible results.
514
515 Decimals and Fractions are supported:
516
517 >>> from decimal import Decimal as D
518 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
519 Decimal('31.01875')
520
521 >>> from fractions import Fraction as F
522 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
523 Fraction(67, 108)
524
525 """
526 if iter(data) is data:
527 data = list(data)
528 n = len(data)
529 if n < 2:
530 raise StatisticsError('variance requires at least two data points')
531 ss = _ss(data, xbar)
532 return ss/(n-1)
533
534
535def pvariance(data, mu=None):
536 """Return the population variance of ``data``.
537
538 data should be an iterable of Real-valued numbers, with at least one
539 value. The optional argument mu, if given, should be the mean of
540 the data. If it is missing or None, the mean is automatically calculated.
541
542 Use this function to calculate the variance from the entire population.
543 To estimate the variance from a sample, the ``variance`` function is
544 usually a better choice.
545
546 Examples:
547
548 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
549 >>> pvariance(data)
550 1.25
551
552 If you have already calculated the mean of the data, you can pass it as
553 the optional second argument to avoid recalculating it:
554
555 >>> mu = mean(data)
556 >>> pvariance(data, mu)
557 1.25
558
559 This function does not check that ``mu`` is actually the mean of ``data``.
560 Giving arbitrary values for ``mu`` may lead to invalid or impossible
561 results.
562
563 Decimals and Fractions are supported:
564
565 >>> from decimal import Decimal as D
566 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
567 Decimal('24.815')
568
569 >>> from fractions import Fraction as F
570 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
571 Fraction(13, 72)
572
573 """
574 if iter(data) is data:
575 data = list(data)
576 n = len(data)
577 if n < 1:
578 raise StatisticsError('pvariance requires at least one data point')
579 ss = _ss(data, mu)
580 return ss/n
581
582
583def stdev(data, xbar=None):
584 """Return the square root of the sample variance.
585
586 See ``variance`` for arguments and other details.
587
588 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
589 1.0810874155219827
590
591 """
592 var = variance(data, xbar)
593 try:
594 return var.sqrt()
595 except AttributeError:
596 return math.sqrt(var)
597
598
599def pstdev(data, mu=None):
600 """Return the square root of the population variance.
601
602 See ``pvariance`` for arguments and other details.
603
604 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
605 0.986893273527251
606
607 """
608 var = pvariance(data, mu)
609 try:
610 return var.sqrt()
611 except AttributeError:
612 return math.sqrt(var)