blob: 30fe55c86a5e85e226593dfa6b5de552333edc70 [file] [log] [blame]
Larry Hastingsf5e987b2013-10-19 11:50:09 -07001"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
10================== =============================================
11Function Description
12================== =============================================
13mean Arithmetic mean (average) of data.
Steven D'Apranoa474afd2016-08-09 12:49:01 +100014harmonic_mean Harmonic mean of data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -070015median Median (middle value) of data.
16median_low Low median of data.
17median_high High median of data.
18median_grouped Median, or 50th percentile, of grouped data.
19mode Mode (most common value) of data.
20================== =============================================
21
22Calculate the arithmetic mean ("the average") of data:
23
24>>> mean([-1.0, 2.5, 3.25, 5.75])
252.625
26
27
28Calculate the standard median of discrete data:
29
30>>> median([2, 3, 4, 5])
313.5
32
33
34Calculate the median, or 50th percentile, of data grouped into class intervals
35centred on the data values provided. E.g. if your data points are rounded to
36the nearest whole number:
37
38>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
392.8333333333...
40
41This should be interpreted in this way: you have two data points in the class
42interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
43the class interval 3.5-4.5. The median of these data points is 2.8333...
44
45
46Calculating variability or spread
47---------------------------------
48
49================== =============================================
50Function Description
51================== =============================================
52pvariance Population variance of data.
53variance Sample variance of data.
54pstdev Population standard deviation of data.
55stdev Sample standard deviation of data.
56================== =============================================
57
58Calculate the standard deviation of sample data:
59
60>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
614.38961843444...
62
63If you have previously calculated the mean, you can pass it as the optional
64second argument to the four "spread" functions to avoid recalculating it:
65
66>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
67>>> mu = mean(data)
68>>> pvariance(data, mu)
692.5
70
71
72Exceptions
73----------
74
75A single exception is defined: StatisticsError is a subclass of ValueError.
76
77"""
78
79__all__ = [ 'StatisticsError',
80 'pstdev', 'pvariance', 'stdev', 'variance',
81 'median', 'median_low', 'median_high', 'median_grouped',
Steven D'Apranofae28292016-10-05 03:24:45 +110082 'mean', 'mode', 'harmonic_mean',
Larry Hastingsf5e987b2013-10-19 11:50:09 -070083 ]
84
Larry Hastingsf5e987b2013-10-19 11:50:09 -070085import collections
Steven D'Apranoa474afd2016-08-09 12:49:01 +100086import decimal
Larry Hastingsf5e987b2013-10-19 11:50:09 -070087import math
Steven D'Apranoa474afd2016-08-09 12:49:01 +100088import numbers
Larry Hastingsf5e987b2013-10-19 11:50:09 -070089
90from fractions import Fraction
91from decimal import Decimal
Steven D'Apranoa474afd2016-08-09 12:49:01 +100092from itertools import groupby, chain
Steven D'Aprano3b06e242016-05-05 03:54:29 +100093from bisect import bisect_left, bisect_right
Steven D'Apranob28c3272015-12-01 19:59:53 +110094
Larry Hastingsf5e987b2013-10-19 11:50:09 -070095
96
97# === Exceptions ===
98
99class StatisticsError(ValueError):
100 pass
101
102
103# === Private utilities ===
104
105def _sum(data, start=0):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100106 """_sum(data [, start]) -> (type, sum, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700107
Steven D'Apranob28c3272015-12-01 19:59:53 +1100108 Return a high-precision sum of the given numeric data as a fraction,
109 together with the type to be converted to and the count of items.
110
111 If optional argument ``start`` is given, it is added to the total.
112 If ``data`` is empty, ``start`` (defaulting to 0) is returned.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700113
114
115 Examples
116 --------
117
118 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700119 (<class 'float'>, Fraction(11, 1), 5)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700120
121 Some sources of round-off error will be avoided:
122
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000123 # Built-in sum returns zero.
124 >>> _sum([1e50, 1, -1e50] * 1000)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700125 (<class 'float'>, Fraction(1000, 1), 3000)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700126
127 Fractions and Decimals are also supported:
128
129 >>> from fractions import Fraction as F
130 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
Benjamin Petersonab078e92016-07-13 21:13:29 -0700131 (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700132
133 >>> from decimal import Decimal as D
134 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
135 >>> _sum(data)
Benjamin Petersonab078e92016-07-13 21:13:29 -0700136 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700137
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000138 Mixed types are currently treated as an error, except that int is
139 allowed.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700140 """
Steven D'Apranob28c3272015-12-01 19:59:53 +1100141 count = 0
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700142 n, d = _exact_ratio(start)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100143 partials = {d: n}
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700144 partials_get = partials.get
Steven D'Apranob28c3272015-12-01 19:59:53 +1100145 T = _coerce(int, type(start))
146 for typ, values in groupby(data, type):
147 T = _coerce(T, typ) # or raise TypeError
148 for n,d in map(_exact_ratio, values):
149 count += 1
150 partials[d] = partials_get(d, 0) + n
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700151 if None in partials:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100152 # The sum will be a NAN or INF. We can ignore all the finite
153 # partials, and just look at this special one.
154 total = partials[None]
155 assert not _isfinite(total)
156 else:
157 # Sum all the partial sums using builtin sum.
158 # FIXME is this faster if we sum them in order of the denominator?
159 total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
160 return (T, total, count)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700161
162
Steven D'Apranob28c3272015-12-01 19:59:53 +1100163def _isfinite(x):
164 try:
165 return x.is_finite() # Likely a Decimal.
166 except AttributeError:
167 return math.isfinite(x) # Coerces to float first.
168
169
170def _coerce(T, S):
171 """Coerce types T and S to a common type, or raise TypeError.
172
173 Coercion rules are currently an implementation detail. See the CoerceTest
174 test class in test_statistics for details.
175 """
176 # See http://bugs.python.org/issue24068.
177 assert T is not bool, "initial type T is bool"
178 # If the types are the same, no need to coerce anything. Put this
179 # first, so that the usual case (no coercion needed) happens as soon
180 # as possible.
181 if T is S: return T
182 # Mixed int & other coerce to the other type.
183 if S is int or S is bool: return T
184 if T is int: return S
185 # If one is a (strict) subclass of the other, coerce to the subclass.
186 if issubclass(S, T): return S
187 if issubclass(T, S): return T
188 # Ints coerce to the other type.
189 if issubclass(T, int): return S
190 if issubclass(S, int): return T
191 # Mixed fraction & float coerces to float (or float subclass).
192 if issubclass(T, Fraction) and issubclass(S, float):
193 return S
194 if issubclass(T, float) and issubclass(S, Fraction):
195 return T
196 # Any other combination is disallowed.
197 msg = "don't know how to coerce %s and %s"
198 raise TypeError(msg % (T.__name__, S.__name__))
Nick Coghlan73afe2a2014-02-08 19:58:04 +1000199
200
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700201def _exact_ratio(x):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100202 """Return Real number x to exact (numerator, denominator) pair.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700203
204 >>> _exact_ratio(0.25)
205 (1, 4)
206
207 x is expected to be an int, Fraction, Decimal or float.
208 """
209 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100210 # Optimise the common case of floats. We expect that the most often
211 # used numeric type will be builtin floats, so try to make this as
212 # fast as possible.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000213 if type(x) is float or type(x) is Decimal:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100214 return x.as_integer_ratio()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700215 try:
Steven D'Apranob28c3272015-12-01 19:59:53 +1100216 # x may be an int, Fraction, or Integral ABC.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700217 return (x.numerator, x.denominator)
218 except AttributeError:
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700219 try:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000220 # x may be a float or Decimal subclass.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700221 return x.as_integer_ratio()
222 except AttributeError:
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000223 # Just give up?
224 pass
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700225 except (OverflowError, ValueError):
Steven D'Apranob28c3272015-12-01 19:59:53 +1100226 # float NAN or INF.
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000227 assert not _isfinite(x)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700228 return (x, None)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100229 msg = "can't convert type '{}' to numerator/denominator"
230 raise TypeError(msg.format(type(x).__name__))
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700231
232
Steven D'Apranob28c3272015-12-01 19:59:53 +1100233def _convert(value, T):
234 """Convert value to given numeric type T."""
235 if type(value) is T:
236 # This covers the cases where T is Fraction, or where value is
237 # a NAN or INF (Decimal or float).
238 return value
239 if issubclass(T, int) and value.denominator != 1:
240 T = float
241 try:
242 # FIXME: what do we do if this overflows?
243 return T(value)
244 except TypeError:
245 if issubclass(T, Decimal):
246 return T(value.numerator)/T(value.denominator)
247 else:
248 raise
249
250
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700251def _counts(data):
252 # Generate a table of sorted (value, frequency) pairs.
Nick Coghlanbfd68bf2014-02-08 19:44:16 +1000253 table = collections.Counter(iter(data)).most_common()
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700254 if not table:
255 return table
256 # Extract the values with the highest frequency.
257 maxfreq = table[0][1]
258 for i in range(1, len(table)):
259 if table[i][1] != maxfreq:
260 table = table[:i]
261 break
262 return table
263
264
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000265def _find_lteq(a, x):
266 'Locate the leftmost value exactly equal to x'
267 i = bisect_left(a, x)
268 if i != len(a) and a[i] == x:
269 return i
270 raise ValueError
271
272
273def _find_rteq(a, l, x):
274 'Locate the rightmost value exactly equal to x'
275 i = bisect_right(a, x, lo=l)
276 if i != (len(a)+1) and a[i-1] == x:
277 return i-1
278 raise ValueError
279
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000280
281def _fail_neg(values, errmsg='negative value'):
282 """Iterate over values, failing if any are less than zero."""
283 for x in values:
284 if x < 0:
285 raise StatisticsError(errmsg)
286 yield x
287
288
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700289# === Measures of central tendency (averages) ===
290
291def mean(data):
292 """Return the sample arithmetic mean of data.
293
294 >>> mean([1, 2, 3, 4, 4])
295 2.8
296
297 >>> from fractions import Fraction as F
298 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
299 Fraction(13, 21)
300
301 >>> from decimal import Decimal as D
302 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
303 Decimal('0.5625')
304
305 If ``data`` is empty, StatisticsError will be raised.
306 """
307 if iter(data) is data:
308 data = list(data)
309 n = len(data)
310 if n < 1:
311 raise StatisticsError('mean requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100312 T, total, count = _sum(data)
313 assert count == n
314 return _convert(total/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700315
316
Steven D'Apranoa474afd2016-08-09 12:49:01 +1000317def harmonic_mean(data):
318 """Return the harmonic mean of data.
319
320 The harmonic mean, sometimes called the subcontrary mean, is the
321 reciprocal of the arithmetic mean of the reciprocals of the data,
322 and is often appropriate when averaging quantities which are rates
323 or ratios, for example speeds. Example:
324
325 Suppose an investor purchases an equal value of shares in each of
326 three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
327 What is the average P/E ratio for the investor's portfolio?
328
329 >>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio.
330 3.6
331
332 Using the arithmetic mean would give an average of about 5.167, which
333 is too high.
334
335 If ``data`` is empty, or any element is less than zero,
336 ``harmonic_mean`` will raise ``StatisticsError``.
337 """
338 # For a justification for using harmonic mean for P/E ratios, see
339 # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
340 # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
341 if iter(data) is data:
342 data = list(data)
343 errmsg = 'harmonic mean does not support negative values'
344 n = len(data)
345 if n < 1:
346 raise StatisticsError('harmonic_mean requires at least one data point')
347 elif n == 1:
348 x = data[0]
349 if isinstance(x, (numbers.Real, Decimal)):
350 if x < 0:
351 raise StatisticsError(errmsg)
352 return x
353 else:
354 raise TypeError('unsupported type')
355 try:
356 T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
357 except ZeroDivisionError:
358 return 0
359 assert count == n
360 return _convert(n/total, T)
361
362
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700363# FIXME: investigate ways to calculate medians without sorting? Quickselect?
364def median(data):
365 """Return the median (middle value) of numeric data.
366
367 When the number of data points is odd, return the middle data point.
368 When the number of data points is even, the median is interpolated by
369 taking the average of the two middle values:
370
371 >>> median([1, 3, 5])
372 3
373 >>> median([1, 3, 5, 7])
374 4.0
375
376 """
377 data = sorted(data)
378 n = len(data)
379 if n == 0:
380 raise StatisticsError("no median for empty data")
381 if n%2 == 1:
382 return data[n//2]
383 else:
384 i = n//2
385 return (data[i - 1] + data[i])/2
386
387
388def median_low(data):
389 """Return the low median of numeric data.
390
391 When the number of data points is odd, the middle value is returned.
392 When it is even, the smaller of the two middle values is returned.
393
394 >>> median_low([1, 3, 5])
395 3
396 >>> median_low([1, 3, 5, 7])
397 3
398
399 """
400 data = sorted(data)
401 n = len(data)
402 if n == 0:
403 raise StatisticsError("no median for empty data")
404 if n%2 == 1:
405 return data[n//2]
406 else:
407 return data[n//2 - 1]
408
409
410def median_high(data):
411 """Return the high median of data.
412
413 When the number of data points is odd, the middle value is returned.
414 When it is even, the larger of the two middle values is returned.
415
416 >>> median_high([1, 3, 5])
417 3
418 >>> median_high([1, 3, 5, 7])
419 5
420
421 """
422 data = sorted(data)
423 n = len(data)
424 if n == 0:
425 raise StatisticsError("no median for empty data")
426 return data[n//2]
427
428
429def median_grouped(data, interval=1):
Zachary Waredf2660e2015-10-27 22:00:41 -0500430 """Return the 50th percentile (median) of grouped continuous data.
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700431
432 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
433 3.7
434 >>> median_grouped([52, 52, 53, 54])
435 52.5
436
437 This calculates the median as the 50th percentile, and should be
438 used when your data is continuous and grouped. In the above example,
439 the values 1, 2, 3, etc. actually represent the midpoint of classes
440 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
441 class 3.5-4.5, and interpolation is used to estimate it.
442
443 Optional argument ``interval`` represents the class interval, and
444 defaults to 1. Changing the class interval naturally will change the
445 interpolated 50th percentile value:
446
447 >>> median_grouped([1, 3, 3, 5, 7], interval=1)
448 3.25
449 >>> median_grouped([1, 3, 3, 5, 7], interval=2)
450 3.5
451
452 This function does not check whether the data points are at least
453 ``interval`` apart.
454 """
455 data = sorted(data)
456 n = len(data)
457 if n == 0:
458 raise StatisticsError("no median for empty data")
459 elif n == 1:
460 return data[0]
461 # Find the value at the midpoint. Remember this corresponds to the
462 # centre of the class interval.
463 x = data[n//2]
464 for obj in (x, interval):
465 if isinstance(obj, (str, bytes)):
466 raise TypeError('expected number but got %r' % obj)
467 try:
468 L = x - interval/2 # The lower limit of the median interval.
469 except TypeError:
470 # Mixed type. For now we just coerce to float.
471 L = float(x) - float(interval)/2
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000472
473 # Uses bisection search to search for x in data with log(n) time complexity
Martin Panterf1579822016-05-26 06:03:33 +0000474 # Find the position of leftmost occurrence of x in data
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000475 l1 = _find_lteq(data, x)
Martin Panterf1579822016-05-26 06:03:33 +0000476 # Find the position of rightmost occurrence of x in data[l1...len(data)]
Steven D'Aprano3b06e242016-05-05 03:54:29 +1000477 # Assuming always l1 <= l2
478 l2 = _find_rteq(data, l1, x)
479 cf = l1
480 f = l2 - l1 + 1
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700481 return L + interval*(n/2 - cf)/f
482
483
484def mode(data):
485 """Return the most common data point from discrete or nominal data.
486
487 ``mode`` assumes discrete data, and returns a single value. This is the
488 standard treatment of the mode as commonly taught in schools:
489
490 >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
491 3
492
493 This also works with nominal (non-numeric) data:
494
495 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
496 'red'
497
498 If there is not exactly one most common value, ``mode`` will raise
499 StatisticsError.
500 """
501 # Generate a table of sorted (value, frequency) pairs.
502 table = _counts(data)
503 if len(table) == 1:
504 return table[0][0]
505 elif table:
506 raise StatisticsError(
507 'no unique mode; found %d equally common values' % len(table)
508 )
509 else:
510 raise StatisticsError('no mode for empty data')
511
512
513# === Measures of spread ===
514
515# See http://mathworld.wolfram.com/Variance.html
516# http://mathworld.wolfram.com/SampleVariance.html
517# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
518#
519# Under no circumstances use the so-called "computational formula for
520# variance", as that is only suitable for hand calculations with a small
521# amount of low-precision data. It has terrible numeric properties.
522#
523# See a comparison of three computational methods here:
524# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
525
526def _ss(data, c=None):
527 """Return sum of square deviations of sequence data.
528
529 If ``c`` is None, the mean is calculated in one pass, and the deviations
530 from the mean are calculated in a second pass. Otherwise, deviations are
531 calculated from ``c`` as given. Use the second case with care, as it can
532 lead to garbage results.
533 """
534 if c is None:
535 c = mean(data)
Steven D'Apranob28c3272015-12-01 19:59:53 +1100536 T, total, count = _sum((x-c)**2 for x in data)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700537 # The following sum should mathematically equal zero, but due to rounding
538 # error may not.
Steven D'Apranob28c3272015-12-01 19:59:53 +1100539 U, total2, count2 = _sum((x-c) for x in data)
540 assert T == U and count == count2
541 total -= total2**2/len(data)
542 assert not total < 0, 'negative sum of square deviations: %f' % total
543 return (T, total)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700544
545
546def variance(data, xbar=None):
547 """Return the sample variance of data.
548
549 data should be an iterable of Real-valued numbers, with at least two
550 values. The optional argument xbar, if given, should be the mean of
551 the data. If it is missing or None, the mean is automatically calculated.
552
553 Use this function when your data is a sample from a population. To
554 calculate the variance from the entire population, see ``pvariance``.
555
556 Examples:
557
558 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
559 >>> variance(data)
560 1.3720238095238095
561
562 If you have already calculated the mean of your data, you can pass it as
563 the optional second argument ``xbar`` to avoid recalculating it:
564
565 >>> m = mean(data)
566 >>> variance(data, m)
567 1.3720238095238095
568
569 This function does not check that ``xbar`` is actually the mean of
570 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
571 impossible results.
572
573 Decimals and Fractions are supported:
574
575 >>> from decimal import Decimal as D
576 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
577 Decimal('31.01875')
578
579 >>> from fractions import Fraction as F
580 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
581 Fraction(67, 108)
582
583 """
584 if iter(data) is data:
585 data = list(data)
586 n = len(data)
587 if n < 2:
588 raise StatisticsError('variance requires at least two data points')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100589 T, ss = _ss(data, xbar)
590 return _convert(ss/(n-1), T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700591
592
593def pvariance(data, mu=None):
594 """Return the population variance of ``data``.
595
596 data should be an iterable of Real-valued numbers, with at least one
597 value. The optional argument mu, if given, should be the mean of
598 the data. If it is missing or None, the mean is automatically calculated.
599
600 Use this function to calculate the variance from the entire population.
601 To estimate the variance from a sample, the ``variance`` function is
602 usually a better choice.
603
604 Examples:
605
606 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
607 >>> pvariance(data)
608 1.25
609
610 If you have already calculated the mean of the data, you can pass it as
611 the optional second argument to avoid recalculating it:
612
613 >>> mu = mean(data)
614 >>> pvariance(data, mu)
615 1.25
616
617 This function does not check that ``mu`` is actually the mean of ``data``.
618 Giving arbitrary values for ``mu`` may lead to invalid or impossible
619 results.
620
621 Decimals and Fractions are supported:
622
623 >>> from decimal import Decimal as D
624 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
625 Decimal('24.815')
626
627 >>> from fractions import Fraction as F
628 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
629 Fraction(13, 72)
630
631 """
632 if iter(data) is data:
633 data = list(data)
634 n = len(data)
635 if n < 1:
636 raise StatisticsError('pvariance requires at least one data point')
Steven D'Apranob28c3272015-12-01 19:59:53 +1100637 T, ss = _ss(data, mu)
638 return _convert(ss/n, T)
Larry Hastingsf5e987b2013-10-19 11:50:09 -0700639
640
641def stdev(data, xbar=None):
642 """Return the square root of the sample variance.
643
644 See ``variance`` for arguments and other details.
645
646 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
647 1.0810874155219827
648
649 """
650 var = variance(data, xbar)
651 try:
652 return var.sqrt()
653 except AttributeError:
654 return math.sqrt(var)
655
656
657def pstdev(data, mu=None):
658 """Return the square root of the population variance.
659
660 See ``pvariance`` for arguments and other details.
661
662 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
663 0.986893273527251
664
665 """
666 var = pvariance(data, mu)
667 try:
668 return var.sqrt()
669 except AttributeError:
670 return math.sqrt(var)