blob: 8c41dd346332188748927b9586af0e70909b5f57 [file] [log] [blame]
Larry Hastingsf5e987b2013-10-19 11:50:09 -07001## Module statistics.py
2##
3## Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>.
4##
5## Licensed under the Apache License, Version 2.0 (the "License");
6## you may not use this file except in compliance with the License.
7## You may obtain a copy of the License at
8##
9## http://www.apache.org/licenses/LICENSE-2.0
10##
11## Unless required by applicable law or agreed to in writing, software
12## distributed under the License is distributed on an "AS IS" BASIS,
13## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14## See the License for the specific language governing permissions and
15## limitations under the License.
16
17
18"""
19Basic statistics module.
20
21This module provides functions for calculating statistics of data, including
22averages, variance, and standard deviation.
23
24Calculating averages
25--------------------
26
27================== =============================================
28Function Description
29================== =============================================
30mean Arithmetic mean (average) of data.
Steven D'Apranoa474afd2016-08-09 12:49:01 +100031harmonic_mean Harmonic mean of data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -070032median Median (middle value) of data.
33median_low Low median of data.
34median_high High median of data.
35median_grouped Median, or 50th percentile, of grouped data.
36mode Mode (most common value) of data.
37================== =============================================
38
39Calculate the arithmetic mean ("the average") of data:
40
41>>> mean([-1.0, 2.5, 3.25, 5.75])
422.625
43
44
45Calculate the standard median of discrete data:
46
47>>> median([2, 3, 4, 5])
483.5
49
50
51Calculate the median, or 50th percentile, of data grouped into class intervals
52centred on the data values provided. E.g. if your data points are rounded to
53the nearest whole number:
54
55>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
562.8333333333...
57
58This should be interpreted in this way: you have two data points in the class
59interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
60the class interval 3.5-4.5. The median of these data points is 2.8333...
61
62
63Calculating variability or spread
64---------------------------------
65
66================== =============================================
67Function Description
68================== =============================================
69pvariance Population variance of data.
70variance Sample variance of data.
71pstdev Population standard deviation of data.
72stdev Sample standard deviation of data.
73================== =============================================
74
75Calculate the standard deviation of sample data:
76
77>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
784.38961843444...
79
80If you have previously calculated the mean, you can pass it as the optional
81second argument to the four "spread" functions to avoid recalculating it:
82
83>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
84>>> mu = mean(data)
85>>> pvariance(data, mu)
862.5
87
88
89Exceptions
90----------
91
92A single exception is defined: StatisticsError is a subclass of ValueError.
93
94"""
95
96__all__ = [ 'StatisticsError',
97 'pstdev', 'pvariance', 'stdev', 'variance',
98 'median', 'median_low', 'median_high', 'median_grouped',
Steven D'Apranoa474afd2016-08-09 12:49:01 +100099 'mean', 'mode', 'harmonic_mean',
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700100 ]
101
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700102import collections
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000103import decimal
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700104import math
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000105import numbers
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700106
107from fractions import Fraction
108from decimal import Decimal
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000109from itertools import groupby, chain
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000110from bisect import bisect_left, bisect_right
Steven D'Apranob28c3272015-12-01 19:59:53 +1100111
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700112
113
114# === Exceptions ===
115
116class StatisticsError(ValueError):
117 pass
118
119
120# === Private utilities ===
121
122def _sum(data, start=0):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100123 """_sum(data [, start]) -> (type, sum, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700124
Steven D'Apranob28c3272015-12-01 19:59:53 +1100125 Return a high-precision sum of the given numeric data as a fraction,
126 together with the type to be converted to and the count of items.
127
128 If optional argument ``start`` is given, it is added to the total.
129 If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700130
131
132 Examples
133 --------
134
135 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700136 (<class 'float'>, Fraction(11, 1), 5)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700137
138 Some sources of round-off error will be avoided:
139
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000140 # Built-in sum returns zero.
141 >>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700142 (<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700143
144 Fractions and Decimals are also supported:
145
146 >>> from fractions import Fraction as F
147 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Petersonab078e92016-07-13 21:13:29 -0700148 (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700149
150 >>> from decimal import Decimal as D
151 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
152 >>> _sum(data)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700153 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700154
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000155 Mixed types are currently treated as an error, except that int is
156 allowed.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700157 """
Steven D'Apranob28c3272015-12-01 19:59:53 +1100158 count = 0
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700159 n, d = _exact_ratio(start)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100160 partials = {d: n}
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700161 partials_get = partials.get
Steven D'Apranob28c3272015-12-01 19:59:53 +1100162 T = _coerce(int, type(start))
163 for typ, values in groupby(data, type):
164 T = _coerce(T, typ) # or raise TypeError
165 for n,d in map(_exact_ratio, values):
166 count += 1
167 partials[d] = partials_get(d, 0) + n
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700168 if None in partials:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100169 # The sum will be a NAN or INF. We can ignore all the finite
170 # partials, and just look at this special one.
171 total = partials[None]
172 assert not _isfinite(total)
173 else:
174 # Sum all the partial sums using builtin sum.
175 # FIXME is this faster if we sum them in order of the denominator?
176 total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
177 return (T, total, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700178
179
Steven D'Apranob28c3272015-12-01 19:59:53 +1100180def _isfinite(x):
181 try:
182 return x.is_finite() # Likely a Decimal.
183 except AttributeError:
184 return math.isfinite(x) # Coerces to float first.
185
186
187def _coerce(T, S):
188 """Coerce types T and S to a common type, or raise TypeError.
189
190 Coercion rules are currently an implementation detail. See the CoerceTest
191 test class in test_statistics for details.
192 """
193 # See http://bugs.python.org/issue24068.
194 assert T is not bool, "initial type T is bool"
195 # If the types are the same, no need to coerce anything. Put this
196 # first, so that the usual case (no coercion needed) happens as soon
197 # as possible.
198 if T is S: return T
199 # Mixed int & other coerce to the other type.
200 if S is int or S is bool: return T
201 if T is int: return S
202 # If one is a (strict) subclass of the other, coerce to the subclass.
203 if issubclass(S, T): return S
204 if issubclass(T, S): return T
205 # Ints coerce to the other type.
206 if issubclass(T, int): return S
207 if issubclass(S, int): return T
208 # Mixed fraction & float coerces to float (or float subclass).
209 if issubclass(T, Fraction) and issubclass(S, float):
210 return S
211 if issubclass(T, float) and issubclass(S, Fraction):
212 return T
213 # Any other combination is disallowed.
214 msg = "don't know how to coerce %s and %s"
215 raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000216
217
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700218def _exact_ratio(x):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100219 """Return Real number x to exact (numerator, denominator) pair.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700220
221 >>> _exact_ratio(0.25)
222 (1, 4)
223
224 x is expected to be an int, Fraction, Decimal or float.
225 """
226 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100227 # Optimise the common case of floats. We expect that the most often
228 # used numeric type will be builtin floats, so try to make this as
229 # fast as possible.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000230 if type(x) is float or type(x) is Decimal:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100231 return x.as_integer_ratio()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700232 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100233 # x may be an int, Fraction, or Integral ABC.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700234 return (x.numerator, x.denominator)
235 except AttributeError:
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700236 try:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000237 # x may be a float or Decimal subclass.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700238 return x.as_integer_ratio()
239 except AttributeError:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000240 # Just give up?
241 pass
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700242 except (OverflowError, ValueError):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100243 # float NAN or INF.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000244 assert not _isfinite(x)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700245 return (x, None)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100246 msg = "can't convert type '{}' to numerator/denominator"
247 raise TypeError(msg.format(type(x).__name__))
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700248
249
Steven D'Apranob28c3272015-12-01 19:59:53 +1100250def _convert(value, T):
251 """Convert value to given numeric type T."""
252 if type(value) is T:
253 # This covers the cases where T is Fraction, or where value is
254 # a NAN or INF (Decimal or float).
255 return value
256 if issubclass(T, int) and value.denominator != 1:
257 T = float
258 try:
259 # FIXME: what do we do if this overflows?
260 return T(value)
261 except TypeError:
262 if issubclass(T, Decimal):
263 return T(value.numerator)/T(value.denominator)
264 else:
265 raise
266
267
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700268def _counts(data):
269 # Generate a table of sorted (value, frequency) pairs.
Nick Coghlanbfd68bf2014-02-08 19:44:16 +1000270 table = collections.Counter(iter(data)).most_common()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700271 if not table:
272 return table
273 # Extract the values with the highest frequency.
274 maxfreq = table[0][1]
275 for i in range(1, len(table)):
276 if table[i][1] != maxfreq:
277 table = table[:i]
278 break
279 return table
280
281
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000282def _find_lteq(a, x):
283 'Locate the leftmost value exactly equal to x'
284 i = bisect_left(a, x)
285 if i != len(a) and a[i] == x:
286 return i
287 raise ValueError
288
289
290def _find_rteq(a, l, x):
291 'Locate the rightmost value exactly equal to x'
292 i = bisect_right(a, x, lo=l)
293 if i != (len(a)+1) and a[i-1] == x:
294 return i-1
295 raise ValueError
296
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000297
298def _fail_neg(values, errmsg='negative value'):
299 """Iterate over values, failing if any are less than zero."""
300 for x in values:
301 if x < 0:
302 raise StatisticsError(errmsg)
303 yield x
304
305
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700306# === Measures of central tendency (averages) ===
307
308def mean(data):
309 """Return the sample arithmetic mean of data.
310
311 >>> mean([1, 2, 3, 4, 4])
312 2.8
313
314 >>> from fractions import Fraction as F
315 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
316 Fraction(13, 21)
317
318 >>> from decimal import Decimal as D
319 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
320 Decimal('0.5625')
321
322 If ``data`` is empty, StatisticsError will be raised.
323 """
324 if iter(data) is data:
325 data = list(data)
326 n = len(data)
327 if n < 1:
328 raise StatisticsError('mean requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100329 T, total, count = _sum(data)
330 assert count == n
331 return _convert(total/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700332
333
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000334def harmonic_mean(data):
335 """Return the harmonic mean of data.
336
337 The harmonic mean, sometimes called the subcontrary mean, is the
338 reciprocal of the arithmetic mean of the reciprocals of the data,
339 and is often appropriate when averaging quantities which are rates
340 or ratios, for example speeds. Example:
341
342 Suppose an investor purchases an equal value of shares in each of
343 three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
344 What is the average P/E ratio for the investor's portfolio?
345
346 >>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio.
347 3.6
348
349 Using the arithmetic mean would give an average of about 5.167, which
350 is too high.
351
352 If ``data`` is empty, or any element is less than zero,
353 ``harmonic_mean`` will raise ``StatisticsError``.
354 """
355 # For a justification for using harmonic mean for P/E ratios, see
356 # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
357 # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
358 if iter(data) is data:
359 data = list(data)
360 errmsg = 'harmonic mean does not support negative values'
361 n = len(data)
362 if n < 1:
363 raise StatisticsError('harmonic_mean requires at least one data point')
364 elif n == 1:
365 x = data[0]
366 if isinstance(x, (numbers.Real, Decimal)):
367 if x < 0:
368 raise StatisticsError(errmsg)
369 return x
370 else:
371 raise TypeError('unsupported type')
372 try:
373 T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
374 except ZeroDivisionError:
375 return 0
376 assert count == n
377 return _convert(n/total, T)
378
379
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700380# FIXME: investigate ways to calculate medians without sorting? Quickselect?
381def median(data):
382 """Return the median (middle value) of numeric data.
383
384 When the number of data points is odd, return the middle data point.
385 When the number of data points is even, the median is interpolated by
386 taking the average of the two middle values:
387
388 >>> median([1, 3, 5])
389 3
390 >>> median([1, 3, 5, 7])
391 4.0
392
393 """
394 data = sorted(data)
395 n = len(data)
396 if n == 0:
397 raise StatisticsError("no median for empty data")
398 if n%2 == 1:
399 return data[n//2]
400 else:
401 i = n//2
402 return (data[i - 1] + data[i])/2
403
404
405def median_low(data):
406 """Return the low median of numeric data.
407
408 When the number of data points is odd, the middle value is returned.
409 When it is even, the smaller of the two middle values is returned.
410
411 >>> median_low([1, 3, 5])
412 3
413 >>> median_low([1, 3, 5, 7])
414 3
415
416 """
417 data = sorted(data)
418 n = len(data)
419 if n == 0:
420 raise StatisticsError("no median for empty data")
421 if n%2 == 1:
422 return data[n//2]
423 else:
424 return data[n//2 - 1]
425
426
427def median_high(data):
428 """Return the high median of data.
429
430 When the number of data points is odd, the middle value is returned.
431 When it is even, the larger of the two middle values is returned.
432
433 >>> median_high([1, 3, 5])
434 3
435 >>> median_high([1, 3, 5, 7])
436 5
437
438 """
439 data = sorted(data)
440 n = len(data)
441 if n == 0:
442 raise StatisticsError("no median for empty data")
443 return data[n//2]
444
445
446def median_grouped(data, interval=1):
Zachary Waredf2660e2015-10-27 22:00:41 -0500447 """Return the 50th percentile (median) of grouped continuous data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700448
449 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
450 3.7
451 >>> median_grouped([52, 52, 53, 54])
452 52.5
453
454 This calculates the median as the 50th percentile, and should be
455 used when your data is continuous and grouped. In the above example,
456 the values 1, 2, 3, etc. actually represent the midpoint of classes
457 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
458 class 3.5-4.5, and interpolation is used to estimate it.
459
460 Optional argument ``interval`` represents the class interval, and
461 defaults to 1. Changing the class interval naturally will change the
462 interpolated 50th percentile value:
463
464 >>> median_grouped([1, 3, 3, 5, 7], interval=1)
465 3.25
466 >>> median_grouped([1, 3, 3, 5, 7], interval=2)
467 3.5
468
469 This function does not check whether the data points are at least
470 ``interval`` apart.
471 """
472 data = sorted(data)
473 n = len(data)
474 if n == 0:
475 raise StatisticsError("no median for empty data")
476 elif n == 1:
477 return data[0]
478 # Find the value at the midpoint. Remember this corresponds to the
479 # centre of the class interval.
480 x = data[n//2]
481 for obj in (x, interval):
482 if isinstance(obj, (str, bytes)):
483 raise TypeError('expected number but got %r' % obj)
484 try:
485 L = x - interval/2 # The lower limit of the median interval.
486 except TypeError:
487 # Mixed type. For now we just coerce to float.
488 L = float(x) - float(interval)/2
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000489
490 # Uses bisection search to search for x in data with log(n) time complexity
Martin Panterf1579822016-05-26 06:03:33 +0000491 # Find the position of leftmost occurrence of x in data
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000492 l1 = _find_lteq(data, x)
Martin Panterf1579822016-05-26 06:03:33 +0000493 # Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000494 # Assuming always l1 <= l2
495 l2 = _find_rteq(data, l1, x)
496 cf = l1
497 f = l2 - l1 + 1
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700498 return L + interval*(n/2 - cf)/f
499
500
501def mode(data):
502 """Return the most common data point from discrete or nominal data.
503
504 ``mode`` assumes discrete data, and returns a single value. This is the
505 standard treatment of the mode as commonly taught in schools:
506
507 >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
508 3
509
510 This also works with nominal (non-numeric) data:
511
512 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
513 'red'
514
515 If there is not exactly one most common value, ``mode`` will raise
516 StatisticsError.
517 """
518 # Generate a table of sorted (value, frequency) pairs.
519 table = _counts(data)
520 if len(table) == 1:
521 return table[0][0]
522 elif table:
523 raise StatisticsError(
524 'no unique mode; found %d equally common values' % len(table)
525 )
526 else:
527 raise StatisticsError('no mode for empty data')
528
529
530# === Measures of spread ===
531
532# See http://mathworld.wolfram.com/Variance.html
533# http://mathworld.wolfram.com/SampleVariance.html
534# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
535#
536# Under no circumstances use the so-called "computational formula for
537# variance", as that is only suitable for hand calculations with a small
538# amount of low-precision data. It has terrible numeric properties.
539#
540# See a comparison of three computational methods here:
541# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
542
543def _ss(data, c=None):
544 """Return sum of square deviations of sequence data.
545
546 If ``c`` is None, the mean is calculated in one pass, and the deviations
547 from the mean are calculated in a second pass. Otherwise, deviations are
548 calculated from ``c`` as given. Use the second case with care, as it can
549 lead to garbage results.
550 """
551 if c is None:
552 c = mean(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100553 T, total, count = _sum((x-c)**2 for x in data)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700554 # The following sum should mathematically equal zero, but due to rounding
555 # error may not.
Steven D'Apranob28c3272015-12-01 19:59:53 +1100556 U, total2, count2 = _sum((x-c) for x in data)
557 assert T == U and count == count2
558 total -= total2**2/len(data)
559 assert not total < 0, 'negative sum of square deviations: %f' % total
560 return (T, total)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700561
562
563def variance(data, xbar=None):
564 """Return the sample variance of data.
565
566 data should be an iterable of Real-valued numbers, with at least two
567 values. The optional argument xbar, if given, should be the mean of
568 the data. If it is missing or None, the mean is automatically calculated.
569
570 Use this function when your data is a sample from a population. To
571 calculate the variance from the entire population, see ``pvariance``.
572
573 Examples:
574
575 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
576 >>> variance(data)
577 1.3720238095238095
578
579 If you have already calculated the mean of your data, you can pass it as
580 the optional second argument ``xbar`` to avoid recalculating it:
581
582 >>> m = mean(data)
583 >>> variance(data, m)
584 1.3720238095238095
585
586 This function does not check that ``xbar`` is actually the mean of
587 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
588 impossible results.
589
590 Decimals and Fractions are supported:
591
592 >>> from decimal import Decimal as D
593 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
594 Decimal('31.01875')
595
596 >>> from fractions import Fraction as F
597 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
598 Fraction(67, 108)
599
600 """
601 if iter(data) is data:
602 data = list(data)
603 n = len(data)
604 if n < 2:
605 raise StatisticsError('variance requires at least two data points')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100606 T, ss = _ss(data, xbar)
607 return _convert(ss/(n-1), T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700608
609
610def pvariance(data, mu=None):
611 """Return the population variance of ``data``.
612
613 data should be an iterable of Real-valued numbers, with at least one
614 value. The optional argument mu, if given, should be the mean of
615 the data. If it is missing or None, the mean is automatically calculated.
616
617 Use this function to calculate the variance from the entire population.
618 To estimate the variance from a sample, the ``variance`` function is
619 usually a better choice.
620
621 Examples:
622
623 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
624 >>> pvariance(data)
625 1.25
626
627 If you have already calculated the mean of the data, you can pass it as
628 the optional second argument to avoid recalculating it:
629
630 >>> mu = mean(data)
631 >>> pvariance(data, mu)
632 1.25
633
634 This function does not check that ``mu`` is actually the mean of ``data``.
635 Giving arbitrary values for ``mu`` may lead to invalid or impossible
636 results.
637
638 Decimals and Fractions are supported:
639
640 >>> from decimal import Decimal as D
641 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
642 Decimal('24.815')
643
644 >>> from fractions import Fraction as F
645 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
646 Fraction(13, 72)
647
648 """
649 if iter(data) is data:
650 data = list(data)
651 n = len(data)
652 if n < 1:
653 raise StatisticsError('pvariance requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100654 T, ss = _ss(data, mu)
655 return _convert(ss/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700656
657
658def stdev(data, xbar=None):
659 """Return the square root of the sample variance.
660
661 See ``variance`` for arguments and other details.
662
663 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
664 1.0810874155219827
665
666 """
667 var = variance(data, xbar)
668 try:
669 return var.sqrt()
670 except AttributeError:
671 return math.sqrt(var)
672
673
674def pstdev(data, mu=None):
675 """Return the square root of the population variance.
676
677 See ``pvariance`` for arguments and other details.
678
679 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
680 0.986893273527251
681
682 """
683 var = pvariance(data, mu)
684 try:
685 return var.sqrt()
686 except AttributeError:
687 return math.sqrt(var)