Blame - Lib/statistics.py - platform/external/python/cpython3

blob: a18241ba0434933c8118fc28a4b138720bad36e2 [file] [log] [blame]

Larry Hastings	f5e987b	2013-10-19 11:50:09 -0700	[diff] [blame^]	1	## Module statistics.py
				2	##
				3	## Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>.
				4	##
				5	## Licensed under the Apache License, Version 2.0 (the "License");
				6	## you may not use this file except in compliance with the License.
				7	## You may obtain a copy of the License at
				8	##
				9	## http://www.apache.org/licenses/LICENSE-2.0
				10	##
				11	## Unless required by applicable law or agreed to in writing, software
				12	## distributed under the License is distributed on an "AS IS" BASIS,
				13	## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	## See the License for the specific language governing permissions and
				15	## limitations under the License.
				16
				17
				18	"""
				19	Basic statistics module.
				20
				21	This module provides functions for calculating statistics of data, including
				22	averages, variance, and standard deviation.
				23
				24	Calculating averages
				25	--------------------
				26
				27	================== =============================================
				28	Function Description
				29	================== =============================================
				30	mean Arithmetic mean (average) of data.
				31	median Median (middle value) of data.
				32	median_low Low median of data.
				33	median_high High median of data.
				34	median_grouped Median, or 50th percentile, of grouped data.
				35	mode Mode (most common value) of data.
				36	================== =============================================
				37
				38	Calculate the arithmetic mean ("the average") of data:
				39
				40	>>> mean([-1.0, 2.5, 3.25, 5.75])
				41	2.625
				42
				43
				44	Calculate the standard median of discrete data:
				45
				46	>>> median([2, 3, 4, 5])
				47	3.5
				48
				49
				50	Calculate the median, or 50th percentile, of data grouped into class intervals
				51	centred on the data values provided. E.g. if your data points are rounded to
				52	the nearest whole number:
				53
				54	>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
				55	2.8333333333...
				56
				57	This should be interpreted in this way: you have two data points in the class
				58	interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
				59	the class interval 3.5-4.5. The median of these data points is 2.8333...
				60
				61
				62	Calculating variability or spread
				63	---------------------------------
				64
				65	================== =============================================
				66	Function Description
				67	================== =============================================
				68	pvariance Population variance of data.
				69	variance Sample variance of data.
				70	pstdev Population standard deviation of data.
				71	stdev Sample standard deviation of data.
				72	================== =============================================
				73
				74	Calculate the standard deviation of sample data:
				75
				76	>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
				77	4.38961843444...
				78
				79	If you have previously calculated the mean, you can pass it as the optional
				80	second argument to the four "spread" functions to avoid recalculating it:
				81
				82	>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
				83	>>> mu = mean(data)
				84	>>> pvariance(data, mu)
				85	2.5
				86
				87
				88	Exceptions
				89	----------
				90
				91	A single exception is defined: StatisticsError is a subclass of ValueError.
				92
				93	"""
				94
				95	__all__ = [ 'StatisticsError',
				96	'pstdev', 'pvariance', 'stdev', 'variance',
				97	'median', 'median_low', 'median_high', 'median_grouped',
				98	'mean', 'mode',
				99	]
				100
				101
				102	import collections
				103	import math
				104	import numbers
				105	import operator
				106
				107	from fractions import Fraction
				108	from decimal import Decimal
				109
				110
				111	# === Exceptions ===
				112
				113	class StatisticsError(ValueError):
				114	pass
				115
				116
				117	# === Private utilities ===
				118
				119	def _sum(data, start=0):
				120	"""_sum(data [, start]) -> value
				121
				122	Return a high-precision sum of the given numeric data. If optional
				123	argument ``start`` is given, it is added to the total. If ``data`` is
				124	empty, ``start`` (defaulting to 0) is returned.
				125
				126
				127	Examples
				128	--------
				129
				130	>>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
				131	11.0
				132
				133	Some sources of round-off error will be avoided:
				134
				135	>>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
				136	1000.0
				137
				138	Fractions and Decimals are also supported:
				139
				140	>>> from fractions import Fraction as F
				141	>>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
				142	Fraction(63, 20)
				143
				144	>>> from decimal import Decimal as D
				145	>>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
				146	>>> _sum(data)
				147	Decimal('0.6963')
				148
				149	"""
				150	n, d = _exact_ratio(start)
				151	T = type(start)
				152	partials = {d: n} # map {denominator: sum of numerators}
				153	# Micro-optimizations.
				154	coerce_types = _coerce_types
				155	exact_ratio = _exact_ratio
				156	partials_get = partials.get
				157	# Add numerators for each denominator, and track the "current" type.
				158	for x in data:
				159	T = _coerce_types(T, type(x))
				160	n, d = exact_ratio(x)
				161	partials[d] = partials_get(d, 0) + n
				162	if None in partials:
				163	assert issubclass(T, (float, Decimal))
				164	assert not math.isfinite(partials[None])
				165	return T(partials[None])
				166	total = Fraction()
				167	for d, n in sorted(partials.items()):
				168	total += Fraction(n, d)
				169	if issubclass(T, int):
				170	assert total.denominator == 1
				171	return T(total.numerator)
				172	if issubclass(T, Decimal):
				173	return T(total.numerator)/total.denominator
				174	return T(total)
				175
				176
				177	def _exact_ratio(x):
				178	"""Convert Real number x exactly to (numerator, denominator) pair.
				179
				180	>>> _exact_ratio(0.25)
				181	(1, 4)
				182
				183	x is expected to be an int, Fraction, Decimal or float.
				184	"""
				185	try:
				186	try:
				187	# int, Fraction
				188	return (x.numerator, x.denominator)
				189	except AttributeError:
				190	# float
				191	try:
				192	return x.as_integer_ratio()
				193	except AttributeError:
				194	# Decimal
				195	try:
				196	return _decimal_to_ratio(x)
				197	except AttributeError:
				198	msg = "can't convert type '{}' to numerator/denominator"
				199	raise TypeError(msg.format(type(x).__name__)) from None
				200	except (OverflowError, ValueError):
				201	# INF or NAN
				202	if __debug__:
				203	# Decimal signalling NANs cannot be converted to float :-(
				204	if isinstance(x, Decimal):
				205	assert not x.is_finite()
				206	else:
				207	assert not math.isfinite(x)
				208	return (x, None)
				209
				210
				211	# FIXME This is faster than Fraction.from_decimal, but still too slow.
				212	def _decimal_to_ratio(d):
				213	"""Convert Decimal d to exact integer ratio (numerator, denominator).
				214
				215	>>> from decimal import Decimal
				216	>>> _decimal_to_ratio(Decimal("2.6"))
				217	(26, 10)
				218
				219	"""
				220	sign, digits, exp = d.as_tuple()
				221	if exp in ('F', 'n', 'N'): # INF, NAN, sNAN
				222	assert not d.is_finite()
				223	raise ValueError
				224	num = 0
				225	for digit in digits:
				226	num = num*10 + digit
				227	if sign:
				228	num = -num
				229	den = 10**-exp
				230	return (num, den)
				231
				232
				233	def _coerce_types(T1, T2):
				234	"""Coerce types T1 and T2 to a common type.
				235
				236	>>> _coerce_types(int, float)
				237	<class 'float'>
				238
				239	Coercion is performed according to this table, where "N/A" means
				240	that a TypeError exception is raised.
				241
				242	+----------+-----------+-----------+-----------+----------+
				243	\| \| int \| Fraction \| Decimal \| float \|
				244	+----------+-----------+-----------+-----------+----------+
				245	\| int \| int \| Fraction \| Decimal \| float \|
				246	\| Fraction \| Fraction \| Fraction \| N/A \| float \|
				247	\| Decimal \| Decimal \| N/A \| Decimal \| float \|
				248	\| float \| float \| float \| float \| float \|
				249	+----------+-----------+-----------+-----------+----------+
				250
				251	Subclasses trump their parent class; two subclasses of the same
				252	base class will be coerced to the second of the two.
				253
				254	"""
				255	# Get the common/fast cases out of the way first.
				256	if T1 is T2: return T1
				257	if T1 is int: return T2
				258	if T2 is int: return T1
				259	# Subclasses trump their parent class.
				260	if issubclass(T2, T1): return T2
				261	if issubclass(T1, T2): return T1
				262	# Floats trump everything else.
				263	if issubclass(T2, float): return T2
				264	if issubclass(T1, float): return T1
				265	# Subclasses of the same base class give priority to the second.
				266	if T1.__base__ is T2.__base__: return T2
				267	# Otherwise, just give up.
				268	raise TypeError('cannot coerce types %r and %r' % (T1, T2))
				269
				270
				271	def _counts(data):
				272	# Generate a table of sorted (value, frequency) pairs.
				273	if data is None:
				274	raise TypeError('None is not iterable')
				275	table = collections.Counter(data).most_common()
				276	if not table:
				277	return table
				278	# Extract the values with the highest frequency.
				279	maxfreq = table[0][1]
				280	for i in range(1, len(table)):
				281	if table[i][1] != maxfreq:
				282	table = table[:i]
				283	break
				284	return table
				285
				286
				287	# === Measures of central tendency (averages) ===
				288
				289	def mean(data):
				290	"""Return the sample arithmetic mean of data.
				291
				292	>>> mean([1, 2, 3, 4, 4])
				293	2.8
				294
				295	>>> from fractions import Fraction as F
				296	>>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
				297	Fraction(13, 21)
				298
				299	>>> from decimal import Decimal as D
				300	>>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
				301	Decimal('0.5625')
				302
				303	If ``data`` is empty, StatisticsError will be raised.
				304	"""
				305	if iter(data) is data:
				306	data = list(data)
				307	n = len(data)
				308	if n < 1:
				309	raise StatisticsError('mean requires at least one data point')
				310	return _sum(data)/n
				311
				312
				313	# FIXME: investigate ways to calculate medians without sorting? Quickselect?
				314	def median(data):
				315	"""Return the median (middle value) of numeric data.
				316
				317	When the number of data points is odd, return the middle data point.
				318	When the number of data points is even, the median is interpolated by
				319	taking the average of the two middle values:
				320
				321	>>> median([1, 3, 5])
				322	3
				323	>>> median([1, 3, 5, 7])
				324	4.0
				325
				326	"""
				327	data = sorted(data)
				328	n = len(data)
				329	if n == 0:
				330	raise StatisticsError("no median for empty data")
				331	if n%2 == 1:
				332	return data[n//2]
				333	else:
				334	i = n//2
				335	return (data[i - 1] + data[i])/2
				336
				337
				338	def median_low(data):
				339	"""Return the low median of numeric data.
				340
				341	When the number of data points is odd, the middle value is returned.
				342	When it is even, the smaller of the two middle values is returned.
				343
				344	>>> median_low([1, 3, 5])
				345	3
				346	>>> median_low([1, 3, 5, 7])
				347	3
				348
				349	"""
				350	data = sorted(data)
				351	n = len(data)
				352	if n == 0:
				353	raise StatisticsError("no median for empty data")
				354	if n%2 == 1:
				355	return data[n//2]
				356	else:
				357	return data[n//2 - 1]
				358
				359
				360	def median_high(data):
				361	"""Return the high median of data.
				362
				363	When the number of data points is odd, the middle value is returned.
				364	When it is even, the larger of the two middle values is returned.
				365
				366	>>> median_high([1, 3, 5])
				367	3
				368	>>> median_high([1, 3, 5, 7])
				369	5
				370
				371	"""
				372	data = sorted(data)
				373	n = len(data)
				374	if n == 0:
				375	raise StatisticsError("no median for empty data")
				376	return data[n//2]
				377
				378
				379	def median_grouped(data, interval=1):
				380	""""Return the 50th percentile (median) of grouped continuous data.
				381
				382	>>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
				383	3.7
				384	>>> median_grouped([52, 52, 53, 54])
				385	52.5
				386
				387	This calculates the median as the 50th percentile, and should be
				388	used when your data is continuous and grouped. In the above example,
				389	the values 1, 2, 3, etc. actually represent the midpoint of classes
				390	0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
				391	class 3.5-4.5, and interpolation is used to estimate it.
				392
				393	Optional argument ``interval`` represents the class interval, and
				394	defaults to 1. Changing the class interval naturally will change the
				395	interpolated 50th percentile value:
				396
				397	>>> median_grouped([1, 3, 3, 5, 7], interval=1)
				398	3.25
				399	>>> median_grouped([1, 3, 3, 5, 7], interval=2)
				400	3.5
				401
				402	This function does not check whether the data points are at least
				403	``interval`` apart.
				404	"""
				405	data = sorted(data)
				406	n = len(data)
				407	if n == 0:
				408	raise StatisticsError("no median for empty data")
				409	elif n == 1:
				410	return data[0]
				411	# Find the value at the midpoint. Remember this corresponds to the
				412	# centre of the class interval.
				413	x = data[n//2]
				414	for obj in (x, interval):
				415	if isinstance(obj, (str, bytes)):
				416	raise TypeError('expected number but got %r' % obj)
				417	try:
				418	L = x - interval/2 # The lower limit of the median interval.
				419	except TypeError:
				420	# Mixed type. For now we just coerce to float.
				421	L = float(x) - float(interval)/2
				422	cf = data.index(x) # Number of values below the median interval.
				423	# FIXME The following line could be more efficient for big lists.
				424	f = data.count(x) # Number of data points in the median interval.
				425	return L + interval*(n/2 - cf)/f
				426
				427
				428	def mode(data):
				429	"""Return the most common data point from discrete or nominal data.
				430
				431	``mode`` assumes discrete data, and returns a single value. This is the
				432	standard treatment of the mode as commonly taught in schools:
				433
				434	>>> mode([1, 1, 2, 3, 3, 3, 3, 4])
				435	3
				436
				437	This also works with nominal (non-numeric) data:
				438
				439	>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
				440	'red'
				441
				442	If there is not exactly one most common value, ``mode`` will raise
				443	StatisticsError.
				444	"""
				445	# Generate a table of sorted (value, frequency) pairs.
				446	table = _counts(data)
				447	if len(table) == 1:
				448	return table[0][0]
				449	elif table:
				450	raise StatisticsError(
				451	'no unique mode; found %d equally common values' % len(table)
				452	)
				453	else:
				454	raise StatisticsError('no mode for empty data')
				455
				456
				457	# === Measures of spread ===
				458
				459	# See http://mathworld.wolfram.com/Variance.html
				460	# http://mathworld.wolfram.com/SampleVariance.html
				461	# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
				462	#
				463	# Under no circumstances use the so-called "computational formula for
				464	# variance", as that is only suitable for hand calculations with a small
				465	# amount of low-precision data. It has terrible numeric properties.
				466	#
				467	# See a comparison of three computational methods here:
				468	# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
				469
				470	def _ss(data, c=None):
				471	"""Return sum of square deviations of sequence data.
				472
				473	If ``c`` is None, the mean is calculated in one pass, and the deviations
				474	from the mean are calculated in a second pass. Otherwise, deviations are
				475	calculated from ``c`` as given. Use the second case with care, as it can
				476	lead to garbage results.
				477	"""
				478	if c is None:
				479	c = mean(data)
				480	ss = _sum((x-c)**2 for x in data)
				481	# The following sum should mathematically equal zero, but due to rounding
				482	# error may not.
				483	ss -= _sum((x-c) for x in data)**2/len(data)
				484	assert not ss < 0, 'negative sum of square deviations: %f' % ss
				485	return ss
				486
				487
				488	def variance(data, xbar=None):
				489	"""Return the sample variance of data.
				490
				491	data should be an iterable of Real-valued numbers, with at least two
				492	values. The optional argument xbar, if given, should be the mean of
				493	the data. If it is missing or None, the mean is automatically calculated.
				494
				495	Use this function when your data is a sample from a population. To
				496	calculate the variance from the entire population, see ``pvariance``.
				497
				498	Examples:
				499
				500	>>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
				501	>>> variance(data)
				502	1.3720238095238095
				503
				504	If you have already calculated the mean of your data, you can pass it as
				505	the optional second argument ``xbar`` to avoid recalculating it:
				506
				507	>>> m = mean(data)
				508	>>> variance(data, m)
				509	1.3720238095238095
				510
				511	This function does not check that ``xbar`` is actually the mean of
				512	``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
				513	impossible results.
				514
				515	Decimals and Fractions are supported:
				516
				517	>>> from decimal import Decimal as D
				518	>>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				519	Decimal('31.01875')
				520
				521	>>> from fractions import Fraction as F
				522	>>> variance([F(1, 6), F(1, 2), F(5, 3)])
				523	Fraction(67, 108)
				524
				525	"""
				526	if iter(data) is data:
				527	data = list(data)
				528	n = len(data)
				529	if n < 2:
				530	raise StatisticsError('variance requires at least two data points')
				531	ss = _ss(data, xbar)
				532	return ss/(n-1)
				533
				534
				535	def pvariance(data, mu=None):
				536	"""Return the population variance of ``data``.
				537
				538	data should be an iterable of Real-valued numbers, with at least one
				539	value. The optional argument mu, if given, should be the mean of
				540	the data. If it is missing or None, the mean is automatically calculated.
				541
				542	Use this function to calculate the variance from the entire population.
				543	To estimate the variance from a sample, the ``variance`` function is
				544	usually a better choice.
				545
				546	Examples:
				547
				548	>>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
				549	>>> pvariance(data)
				550	1.25
				551
				552	If you have already calculated the mean of the data, you can pass it as
				553	the optional second argument to avoid recalculating it:
				554
				555	>>> mu = mean(data)
				556	>>> pvariance(data, mu)
				557	1.25
				558
				559	This function does not check that ``mu`` is actually the mean of ``data``.
				560	Giving arbitrary values for ``mu`` may lead to invalid or impossible
				561	results.
				562
				563	Decimals and Fractions are supported:
				564
				565	>>> from decimal import Decimal as D
				566	>>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
				567	Decimal('24.815')
				568
				569	>>> from fractions import Fraction as F
				570	>>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
				571	Fraction(13, 72)
				572
				573	"""
				574	if iter(data) is data:
				575	data = list(data)
				576	n = len(data)
				577	if n < 1:
				578	raise StatisticsError('pvariance requires at least one data point')
				579	ss = _ss(data, mu)
				580	return ss/n
				581
				582
				583	def stdev(data, xbar=None):
				584	"""Return the square root of the sample variance.
				585
				586	See ``variance`` for arguments and other details.
				587
				588	>>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				589	1.0810874155219827
				590
				591	"""
				592	var = variance(data, xbar)
				593	try:
				594	return var.sqrt()
				595	except AttributeError:
				596	return math.sqrt(var)
				597
				598
				599	def pstdev(data, mu=None):
				600	"""Return the square root of the population variance.
				601
				602	See ``pvariance`` for arguments and other details.
				603
				604	>>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
				605	0.986893273527251
				606
				607	"""
				608	var = pvariance(data, mu)
				609	try:
				610	return var.sqrt()
				611	except AttributeError:
				612	return math.sqrt(var)