Blame - Lib/csv.py - platform/external/python/cpython3

blob: d08a86b6be888300ece7ea5c8c7be7b08171086f [file] [log] [blame]

Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	1
				2	"""
				3	csv.py - read/write/investigate CSV files
				4	"""
				5
				6	import re
				7	from _csv import Error, __version__, writer, reader, register_dialect, \
				8	unregister_dialect, get_dialect, list_dialects, \
				9	QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
				10	__doc__
				11
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	12	try:
				13	from cStringIO import StringIO
				14	except ImportError:
				15	from StringIO import StringIO
				16
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	17	__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
				18	"Error", "Dialect", "excel", "excel_tab", "reader", "writer",
				19	"register_dialect", "get_dialect", "list_dialects", "Sniffer",
				20	"unregister_dialect", "__version__", "DictReader", "DictWriter" ]
				21
				22	class Dialect:
Skip Montanaro	f26285c	2005-01-05 06:54:58 +0000	[diff] [blame]	23	"""Describe an Excel dialect.
				24
				25	This must be subclassed (see csv.excel). Valid attributes are:
				26	delimiter, quotechar, escapechar, doublequote, skipinitialspace,
				27	lineterminator, quoting.
				28
				29	"""
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	30	_name = ""
				31	_valid = False
				32	# placeholders
				33	delimiter = None
				34	quotechar = None
				35	escapechar = None
				36	doublequote = None
				37	skipinitialspace = None
				38	lineterminator = None
				39	quoting = None
				40
				41	def __init__(self):
				42	if self.__class__ != Dialect:
				43	self._valid = True
				44	errors = self._validate()
				45	if errors != []:
				46	raise Error, "Dialect did not validate: %s" % ", ".join(errors)
				47
				48	def _validate(self):
				49	errors = []
				50	if not self._valid:
				51	errors.append("can't directly instantiate Dialect class")
				52
				53	if self.delimiter is None:
				54	errors.append("delimiter character not set")
				55	elif (not isinstance(self.delimiter, str) or
				56	len(self.delimiter) > 1):
				57	errors.append("delimiter must be one-character string")
				58
				59	if self.quotechar is None:
				60	if self.quoting != QUOTE_NONE:
				61	errors.append("quotechar not set")
				62	elif (not isinstance(self.quotechar, str) or
				63	len(self.quotechar) > 1):
				64	errors.append("quotechar must be one-character string")
				65
				66	if self.lineterminator is None:
				67	errors.append("lineterminator not set")
				68	elif not isinstance(self.lineterminator, str):
				69	errors.append("lineterminator must be a string")
				70
Andrew McNamara	1196cf1	2005-01-07 04:42:45 +0000	[diff] [blame]	71	if self.doublequote not in (True, False) and self.quoting != QUOTE_NONE:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	72	errors.append("doublequote parameter must be True or False")
				73
				74	if self.skipinitialspace not in (True, False):
				75	errors.append("skipinitialspace parameter must be True or False")
				76
				77	if self.quoting is None:
				78	errors.append("quoting parameter not set")
				79
				80	if self.quoting is QUOTE_NONE:
				81	if (not isinstance(self.escapechar, (unicode, str)) or
				82	len(self.escapechar) > 1):
				83	errors.append("escapechar must be a one-character string or unicode object")
				84
				85	return errors
				86
				87	class excel(Dialect):
Skip Montanaro	f26285c	2005-01-05 06:54:58 +0000	[diff] [blame]	88	"""Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	89	delimiter = ','
				90	quotechar = '"'
				91	doublequote = True
				92	skipinitialspace = False
				93	lineterminator = '\r\n'
				94	quoting = QUOTE_MINIMAL
				95	register_dialect("excel", excel)
				96
				97	class excel_tab(excel):
Skip Montanaro	f26285c	2005-01-05 06:54:58 +0000	[diff] [blame]	98	"""Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	99	delimiter = '\t'
				100	register_dialect("excel-tab", excel_tab)
				101
				102
				103	class DictReader:
Skip Montanaro	dffeed3	2003-10-03 14:03:01 +0000	[diff] [blame]	104	def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro	3f7a948	2003-09-06 19:52:12 +0000	[diff] [blame]	105	dialect="excel", args, *kwds):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	106	self.fieldnames = fieldnames # list of keys for the dict
				107	self.restkey = restkey # key to catch long rows
				108	self.restval = restval # default value for short rows
Skip Montanaro	3f7a948	2003-09-06 19:52:12 +0000	[diff] [blame]	109	self.reader = reader(f, dialect, args, *kwds)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	110
				111	def __iter__(self):
				112	return self
				113
				114	def next(self):
				115	row = self.reader.next()
Skip Montanaro	dffeed3	2003-10-03 14:03:01 +0000	[diff] [blame]	116	if self.fieldnames is None:
				117	self.fieldnames = row
				118	row = self.reader.next()
				119
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	120	# unlike the basic reader, we prefer not to return blanks,
				121	# because we will typically wind up with a dict full of None
				122	# values
				123	while row == []:
				124	row = self.reader.next()
				125	d = dict(zip(self.fieldnames, row))
				126	lf = len(self.fieldnames)
				127	lr = len(row)
				128	if lf < lr:
				129	d[self.restkey] = row[lf:]
				130	elif lf > lr:
				131	for key in self.fieldnames[lr:]:
				132	d[key] = self.restval
				133	return d
				134
				135
				136	class DictWriter:
				137	def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro	3f7a948	2003-09-06 19:52:12 +0000	[diff] [blame]	138	dialect="excel", args, *kwds):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	139	self.fieldnames = fieldnames # list of keys for the dict
				140	self.restval = restval # for writing short dicts
				141	if extrasaction.lower() not in ("raise", "ignore"):
				142	raise ValueError, \
				143	("extrasaction (%s) must be 'raise' or 'ignore'" %
				144	extrasaction)
				145	self.extrasaction = extrasaction
Skip Montanaro	3f7a948	2003-09-06 19:52:12 +0000	[diff] [blame]	146	self.writer = writer(f, dialect, args, *kwds)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	147
				148	def _dict_to_list(self, rowdict):
				149	if self.extrasaction == "raise":
				150	for k in rowdict.keys():
				151	if k not in self.fieldnames:
				152	raise ValueError, "dict contains fields not in fieldnames"
				153	return [rowdict.get(key, self.restval) for key in self.fieldnames]
				154
				155	def writerow(self, rowdict):
				156	return self.writer.writerow(self._dict_to_list(rowdict))
				157
				158	def writerows(self, rowdicts):
				159	rows = []
				160	for rowdict in rowdicts:
				161	rows.append(self._dict_to_list(rowdict))
				162	return self.writer.writerows(rows)
				163
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	164	# Guard Sniffer's type checking against builds that exclude complex()
				165	try:
				166	complex
				167	except NameError:
				168	complex = float
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	169
				170	class Sniffer:
				171	'''
				172	"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	173	Returns a Dialect object.
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	174	'''
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	175	def __init__(self):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	176	# in case there is more than one possible delimiter
				177	self.preferred = [',', '\t', ';', ' ', ':']
				178
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	179
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	180	def sniff(self, sample, delimiters=None):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	181	"""
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	182	Returns a dialect (or None) corresponding to the sample
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	183	"""
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	184
				185	quotechar, delimiter, skipinitialspace = \
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	186	self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	187	if delimiter is None:
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	188	delimiter, skipinitialspace = self._guess_delimiter(sample,
				189	delimiters)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	190
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	191	class dialect(Dialect):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	192	_name = "sniffed"
				193	lineterminator = '\r\n'
Fred Drake	7c852f3	2003-04-25 14:27:00 +0000	[diff] [blame]	194	quoting = QUOTE_MINIMAL
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	195	# escapechar = ''
				196	doublequote = False
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	197
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	198	dialect.delimiter = delimiter
				199	# _csv.reader won't accept a quotechar of ''
				200	dialect.quotechar = quotechar or '"'
				201	dialect.skipinitialspace = skipinitialspace
				202
				203	return dialect
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	204
				205
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	206	def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	207	"""
				208	Looks for text enclosed between two identical quotes
				209	(the probable quotechar) which are preceded and followed
				210	by the same character (the probable delimiter).
				211	For example:
				212	,'some text',
				213	The quote with the most wins, same with the delimiter.
				214	If there is no quotechar the delimiter can't be determined
				215	this way.
				216	"""
				217
				218	matches = []
				219	for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?P=delim)', # ,".?",
				220	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".?",
				221	'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?:$\|\n)', # ,".?"
				222	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?:$\|\n)'): # ".?" (no delim, no space)
Fred Drake	6f7b213	2003-09-02 16:01:07 +0000	[diff] [blame]	223	regexp = re.compile(restr, re.DOTALL \| re.MULTILINE)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	224	matches = regexp.findall(data)
				225	if matches:
				226	break
				227
				228	if not matches:
				229	return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
				230
				231	quotes = {}
				232	delims = {}
				233	spaces = 0
				234	for m in matches:
				235	n = regexp.groupindex['quote'] - 1
				236	key = m[n]
				237	if key:
				238	quotes[key] = quotes.get(key, 0) + 1
				239	try:
				240	n = regexp.groupindex['delim'] - 1
				241	key = m[n]
				242	except KeyError:
				243	continue
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	244	if key and (delimiters is None or key in delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	245	delims[key] = delims.get(key, 0) + 1
				246	try:
				247	n = regexp.groupindex['space'] - 1
				248	except KeyError:
				249	continue
				250	if m[n]:
				251	spaces += 1
				252
				253	quotechar = reduce(lambda a, b, quotes = quotes:
				254	(quotes[a] > quotes[b]) and a or b, quotes.keys())
				255
				256	if delims:
				257	delim = reduce(lambda a, b, delims = delims:
				258	(delims[a] > delims[b]) and a or b, delims.keys())
				259	skipinitialspace = delims[delim] == spaces
				260	if delim == '\n': # most likely a file with a single column
				261	delim = ''
				262	else:
				263	# there is no delimiter, it's a single column of quoted data
				264	delim = ''
				265	skipinitialspace = 0
				266
				267	return (quotechar, delim, skipinitialspace)
				268
				269
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	270	def _guess_delimiter(self, data, delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	271	"""
				272	The delimiter /should/ occur the same number of times on
				273	each row. However, due to malformed data, it may not. We don't want
				274	an all or nothing approach, so we allow for small variations in this
				275	number.
				276	1) build a table of the frequency of each character on every line.
				277	2) build a table of freqencies of this frequency (meta-frequency?),
				278	e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
				279	7 times in 2 rows'
				280	3) use the mode of the meta-frequency to determine the /expected/
				281	frequency for that character
				282	4) find out how often the character actually meets that goal
				283	5) the character that best meets its goal is the delimiter
				284	For performance reasons, the data is evaluated in chunks, so it can
				285	try and evaluate the smallest portion of the data possible, evaluating
				286	additional chunks as necessary.
				287	"""
				288
				289	data = filter(None, data.split('\n'))
				290
				291	ascii = [chr(c) for c in range(127)] # 7-bit ASCII
				292
				293	# build frequency tables
				294	chunkLength = min(10, len(data))
				295	iteration = 0
				296	charFrequency = {}
				297	modes = {}
				298	delims = {}
				299	start, end = 0, min(chunkLength, len(data))
				300	while start < len(data):
				301	iteration += 1
				302	for line in data[start:end]:
				303	for char in ascii:
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	304	metaFrequency = charFrequency.get(char, {})
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	305	# must count even if frequency is 0
				306	freq = line.strip().count(char)
				307	# value is the mode
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	308	metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
				309	charFrequency[char] = metaFrequency
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	310
				311	for char in charFrequency.keys():
				312	items = charFrequency[char].items()
				313	if len(items) == 1 and items[0][0] == 0:
				314	continue
				315	# get the mode of the frequencies
				316	if len(items) > 1:
				317	modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
				318	items)
				319	# adjust the mode - subtract the sum of all
				320	# other frequencies
				321	items.remove(modes[char])
				322	modes[char] = (modes[char][0], modes[char][1]
				323	- reduce(lambda a, b: (0, a[1] + b[1]),
				324	items)[1])
				325	else:
				326	modes[char] = items[0]
				327
				328	# build a list of possible delimiters
				329	modeList = modes.items()
				330	total = float(chunkLength * iteration)
				331	# (rows of consistent data) / (number of rows) = 100%
				332	consistency = 1.0
				333	# minimum consistency threshold
				334	threshold = 0.9
				335	while len(delims) == 0 and consistency >= threshold:
				336	for k, v in modeList:
				337	if v[0] > 0 and v[1] > 0:
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	338	if ((v[1]/total) >= consistency and
				339	(delimiters is None or k in delimiters)):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	340	delims[k] = v
				341	consistency -= 0.01
				342
				343	if len(delims) == 1:
				344	delim = delims.keys()[0]
				345	skipinitialspace = (data[0].count(delim) ==
				346	data[0].count("%c " % delim))
				347	return (delim, skipinitialspace)
				348
				349	# analyze another chunkLength lines
				350	start = end
				351	end += chunkLength
				352
				353	if not delims:
				354	return ('', 0)
				355
				356	# if there's more than one, fall back to a 'preferred' list
				357	if len(delims) > 1:
				358	for d in self.preferred:
				359	if d in delims.keys():
				360	skipinitialspace = (data[0].count(d) ==
				361	data[0].count("%c " % d))
				362	return (d, skipinitialspace)
				363
				364	# finally, just return the first damn character in the list
				365	delim = delims.keys()[0]
				366	skipinitialspace = (data[0].count(delim) ==
				367	data[0].count("%c " % delim))
				368	return (delim, skipinitialspace)
				369
				370
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	371	def has_header(self, sample):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	372	# Creates a dictionary of types of data in each column. If any
				373	# column is of a single type (say, integers), except for the first
				374	# row, then the first row is presumed to be labels. If the type
				375	# can't be determined, it is assumed to be a string in which case
				376	# the length of the string is the determining factor: if all of the
				377	# rows except for the first are the same length, it's a header.
				378	# Finally, a 'vote' is taken at the end for each column, adding or
				379	# subtracting from the likelihood of the first row being a header.
				380
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	381	rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	382
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	383	header = rdr.next() # assume first row is header
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	384
				385	columns = len(header)
				386	columnTypes = {}
				387	for i in range(columns): columnTypes[i] = None
				388
				389	checked = 0
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	390	for row in rdr:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	391	# arbitrary number of rows to check, to keep it sane
				392	if checked > 20:
				393	break
				394	checked += 1
				395
				396	if len(row) != columns:
				397	continue # skip rows that have irregular number of columns
				398
				399	for col in columnTypes.keys():
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	400
				401	for thisType in [int, long, float, complex]:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	402	try:
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	403	thisType(row[col])
				404	break
Raymond Hettinger	abe14e6	2003-06-12 03:59:17 +0000	[diff] [blame]	405	except (ValueError, OverflowError):
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	406	pass
				407	else:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	408	# fallback to length of string
				409	thisType = len(row[col])
				410
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	411	# treat longs as ints
				412	if thisType == long:
				413	thisType = int
				414
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	415	if thisType != columnTypes[col]:
				416	if columnTypes[col] is None: # add new column type
				417	columnTypes[col] = thisType
				418	else:
				419	# type is inconsistent, remove column from
				420	# consideration
				421	del columnTypes[col]
				422
				423	# finally, compare results against first row and "vote"
				424	# on whether it's a header
				425	hasHeader = 0
				426	for col, colType in columnTypes.items():
				427	if type(colType) == type(0): # it's a length
				428	if len(header[col]) != colType:
				429	hasHeader += 1
				430	else:
				431	hasHeader -= 1
				432	else: # attempt typecast
				433	try:
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	434	colType(header[col])
Raymond Hettinger	f31cb0c	2003-06-12 04:05:00 +0000	[diff] [blame]	435	except (ValueError, TypeError):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	436	hasHeader += 1
				437	else:
				438	hasHeader -= 1
				439
				440	return hasHeader > 0