Blame - Lib/csv.py - platform/external/python/cpython3

blob: 7e297b69e7a43073f7b102a6069ac9d54fbe1fde [file] [log] [blame]

Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	1
				2	"""
				3	csv.py - read/write/investigate CSV files
				4	"""
				5
				6	import re
				7	from _csv import Error, __version__, writer, reader, register_dialect, \
				8	unregister_dialect, get_dialect, list_dialects, \
				9	QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
				10	__doc__
				11
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	12	try:
				13	from cStringIO import StringIO
				14	except ImportError:
				15	from StringIO import StringIO
				16
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	17	__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
				18	"Error", "Dialect", "excel", "excel_tab", "reader", "writer",
				19	"register_dialect", "get_dialect", "list_dialects", "Sniffer",
				20	"unregister_dialect", "__version__", "DictReader", "DictWriter" ]
				21
				22	class Dialect:
				23	_name = ""
				24	_valid = False
				25	# placeholders
				26	delimiter = None
				27	quotechar = None
				28	escapechar = None
				29	doublequote = None
				30	skipinitialspace = None
				31	lineterminator = None
				32	quoting = None
				33
				34	def __init__(self):
				35	if self.__class__ != Dialect:
				36	self._valid = True
				37	errors = self._validate()
				38	if errors != []:
				39	raise Error, "Dialect did not validate: %s" % ", ".join(errors)
				40
				41	def _validate(self):
				42	errors = []
				43	if not self._valid:
				44	errors.append("can't directly instantiate Dialect class")
				45
				46	if self.delimiter is None:
				47	errors.append("delimiter character not set")
				48	elif (not isinstance(self.delimiter, str) or
				49	len(self.delimiter) > 1):
				50	errors.append("delimiter must be one-character string")
				51
				52	if self.quotechar is None:
				53	if self.quoting != QUOTE_NONE:
				54	errors.append("quotechar not set")
				55	elif (not isinstance(self.quotechar, str) or
				56	len(self.quotechar) > 1):
				57	errors.append("quotechar must be one-character string")
				58
				59	if self.lineterminator is None:
				60	errors.append("lineterminator not set")
				61	elif not isinstance(self.lineterminator, str):
				62	errors.append("lineterminator must be a string")
				63
				64	if self.doublequote not in (True, False):
				65	errors.append("doublequote parameter must be True or False")
				66
				67	if self.skipinitialspace not in (True, False):
				68	errors.append("skipinitialspace parameter must be True or False")
				69
				70	if self.quoting is None:
				71	errors.append("quoting parameter not set")
				72
				73	if self.quoting is QUOTE_NONE:
				74	if (not isinstance(self.escapechar, (unicode, str)) or
				75	len(self.escapechar) > 1):
				76	errors.append("escapechar must be a one-character string or unicode object")
				77
				78	return errors
				79
				80	class excel(Dialect):
				81	delimiter = ','
				82	quotechar = '"'
				83	doublequote = True
				84	skipinitialspace = False
				85	lineterminator = '\r\n'
				86	quoting = QUOTE_MINIMAL
				87	register_dialect("excel", excel)
				88
				89	class excel_tab(excel):
				90	delimiter = '\t'
				91	register_dialect("excel-tab", excel_tab)
				92
				93
				94	class DictReader:
				95	def __init__(self, f, fieldnames, restkey=None, restval=None,
				96	dialect="excel", *args):
				97	self.fieldnames = fieldnames # list of keys for the dict
				98	self.restkey = restkey # key to catch long rows
				99	self.restval = restval # default value for short rows
				100	self.reader = reader(f, dialect, *args)
				101
				102	def __iter__(self):
				103	return self
				104
				105	def next(self):
				106	row = self.reader.next()
				107	# unlike the basic reader, we prefer not to return blanks,
				108	# because we will typically wind up with a dict full of None
				109	# values
				110	while row == []:
				111	row = self.reader.next()
				112	d = dict(zip(self.fieldnames, row))
				113	lf = len(self.fieldnames)
				114	lr = len(row)
				115	if lf < lr:
				116	d[self.restkey] = row[lf:]
				117	elif lf > lr:
				118	for key in self.fieldnames[lr:]:
				119	d[key] = self.restval
				120	return d
				121
				122
				123	class DictWriter:
				124	def __init__(self, f, fieldnames, restval="", extrasaction="raise",
				125	dialect="excel", *args):
				126	self.fieldnames = fieldnames # list of keys for the dict
				127	self.restval = restval # for writing short dicts
				128	if extrasaction.lower() not in ("raise", "ignore"):
				129	raise ValueError, \
				130	("extrasaction (%s) must be 'raise' or 'ignore'" %
				131	extrasaction)
				132	self.extrasaction = extrasaction
				133	self.writer = writer(f, dialect, *args)
				134
				135	def _dict_to_list(self, rowdict):
				136	if self.extrasaction == "raise":
				137	for k in rowdict.keys():
				138	if k not in self.fieldnames:
				139	raise ValueError, "dict contains fields not in fieldnames"
				140	return [rowdict.get(key, self.restval) for key in self.fieldnames]
				141
				142	def writerow(self, rowdict):
				143	return self.writer.writerow(self._dict_to_list(rowdict))
				144
				145	def writerows(self, rowdicts):
				146	rows = []
				147	for rowdict in rowdicts:
				148	rows.append(self._dict_to_list(rowdict))
				149	return self.writer.writerows(rows)
				150
				151
				152	class Sniffer:
				153	'''
				154	"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	155	Returns a Dialect object.
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	156	'''
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	157	def __init__(self):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	158	# in case there is more than one possible delimiter
				159	self.preferred = [',', '\t', ';', ' ', ':']
				160
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	161
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	162	def sniff(self, sample):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	163	"""
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	164	Returns a dialect (or None) corresponding to the sample
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	165	"""
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	166
				167	quotechar, delimiter, skipinitialspace = \
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	168	self._guess_quote_and_delimiter(sample)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	169	if delimiter is None:
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	170	delimiter, skipinitialspace = self._guess_delimiter(sample)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	171
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	172	class dialect(Dialect):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	173	_name = "sniffed"
				174	lineterminator = '\r\n'
Fred Drake	7c852f3	2003-04-25 14:27:00 +0000	[diff] [blame]	175	quoting = QUOTE_MINIMAL
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	176	# escapechar = ''
				177	doublequote = False
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	178
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	179	dialect.delimiter = delimiter
				180	# _csv.reader won't accept a quotechar of ''
				181	dialect.quotechar = quotechar or '"'
				182	dialect.skipinitialspace = skipinitialspace
				183
				184	return dialect
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	185
				186
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	187	def _guess_quote_and_delimiter(self, data):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	188	"""
				189	Looks for text enclosed between two identical quotes
				190	(the probable quotechar) which are preceded and followed
				191	by the same character (the probable delimiter).
				192	For example:
				193	,'some text',
				194	The quote with the most wins, same with the delimiter.
				195	If there is no quotechar the delimiter can't be determined
				196	this way.
				197	"""
				198
				199	matches = []
				200	for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?P=delim)', # ,".?",
				201	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".?",
				202	'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?:$\|\n)', # ,".?"
				203	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?:$\|\n)'): # ".?" (no delim, no space)
				204	regexp = re.compile(restr, re.S \| re.M)
				205	matches = regexp.findall(data)
				206	if matches:
				207	break
				208
				209	if not matches:
				210	return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
				211
				212	quotes = {}
				213	delims = {}
				214	spaces = 0
				215	for m in matches:
				216	n = regexp.groupindex['quote'] - 1
				217	key = m[n]
				218	if key:
				219	quotes[key] = quotes.get(key, 0) + 1
				220	try:
				221	n = regexp.groupindex['delim'] - 1
				222	key = m[n]
				223	except KeyError:
				224	continue
				225	if key:
				226	delims[key] = delims.get(key, 0) + 1
				227	try:
				228	n = regexp.groupindex['space'] - 1
				229	except KeyError:
				230	continue
				231	if m[n]:
				232	spaces += 1
				233
				234	quotechar = reduce(lambda a, b, quotes = quotes:
				235	(quotes[a] > quotes[b]) and a or b, quotes.keys())
				236
				237	if delims:
				238	delim = reduce(lambda a, b, delims = delims:
				239	(delims[a] > delims[b]) and a or b, delims.keys())
				240	skipinitialspace = delims[delim] == spaces
				241	if delim == '\n': # most likely a file with a single column
				242	delim = ''
				243	else:
				244	# there is no delimiter, it's a single column of quoted data
				245	delim = ''
				246	skipinitialspace = 0
				247
				248	return (quotechar, delim, skipinitialspace)
				249
				250
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	251	def _guess_delimiter(self, data):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	252	"""
				253	The delimiter /should/ occur the same number of times on
				254	each row. However, due to malformed data, it may not. We don't want
				255	an all or nothing approach, so we allow for small variations in this
				256	number.
				257	1) build a table of the frequency of each character on every line.
				258	2) build a table of freqencies of this frequency (meta-frequency?),
				259	e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
				260	7 times in 2 rows'
				261	3) use the mode of the meta-frequency to determine the /expected/
				262	frequency for that character
				263	4) find out how often the character actually meets that goal
				264	5) the character that best meets its goal is the delimiter
				265	For performance reasons, the data is evaluated in chunks, so it can
				266	try and evaluate the smallest portion of the data possible, evaluating
				267	additional chunks as necessary.
				268	"""
				269
				270	data = filter(None, data.split('\n'))
				271
				272	ascii = [chr(c) for c in range(127)] # 7-bit ASCII
				273
				274	# build frequency tables
				275	chunkLength = min(10, len(data))
				276	iteration = 0
				277	charFrequency = {}
				278	modes = {}
				279	delims = {}
				280	start, end = 0, min(chunkLength, len(data))
				281	while start < len(data):
				282	iteration += 1
				283	for line in data[start:end]:
				284	for char in ascii:
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	285	metaFrequency = charFrequency.get(char, {})
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	286	# must count even if frequency is 0
				287	freq = line.strip().count(char)
				288	# value is the mode
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	289	metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
				290	charFrequency[char] = metaFrequency
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	291
				292	for char in charFrequency.keys():
				293	items = charFrequency[char].items()
				294	if len(items) == 1 and items[0][0] == 0:
				295	continue
				296	# get the mode of the frequencies
				297	if len(items) > 1:
				298	modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
				299	items)
				300	# adjust the mode - subtract the sum of all
				301	# other frequencies
				302	items.remove(modes[char])
				303	modes[char] = (modes[char][0], modes[char][1]
				304	- reduce(lambda a, b: (0, a[1] + b[1]),
				305	items)[1])
				306	else:
				307	modes[char] = items[0]
				308
				309	# build a list of possible delimiters
				310	modeList = modes.items()
				311	total = float(chunkLength * iteration)
				312	# (rows of consistent data) / (number of rows) = 100%
				313	consistency = 1.0
				314	# minimum consistency threshold
				315	threshold = 0.9
				316	while len(delims) == 0 and consistency >= threshold:
				317	for k, v in modeList:
				318	if v[0] > 0 and v[1] > 0:
				319	if (v[1]/total) >= consistency:
				320	delims[k] = v
				321	consistency -= 0.01
				322
				323	if len(delims) == 1:
				324	delim = delims.keys()[0]
				325	skipinitialspace = (data[0].count(delim) ==
				326	data[0].count("%c " % delim))
				327	return (delim, skipinitialspace)
				328
				329	# analyze another chunkLength lines
				330	start = end
				331	end += chunkLength
				332
				333	if not delims:
				334	return ('', 0)
				335
				336	# if there's more than one, fall back to a 'preferred' list
				337	if len(delims) > 1:
				338	for d in self.preferred:
				339	if d in delims.keys():
				340	skipinitialspace = (data[0].count(d) ==
				341	data[0].count("%c " % d))
				342	return (d, skipinitialspace)
				343
				344	# finally, just return the first damn character in the list
				345	delim = delims.keys()[0]
				346	skipinitialspace = (data[0].count(delim) ==
				347	data[0].count("%c " % delim))
				348	return (delim, skipinitialspace)
				349
				350
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	351	def has_header(self, sample):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	352	# Creates a dictionary of types of data in each column. If any
				353	# column is of a single type (say, integers), except for the first
				354	# row, then the first row is presumed to be labels. If the type
				355	# can't be determined, it is assumed to be a string in which case
				356	# the length of the string is the determining factor: if all of the
				357	# rows except for the first are the same length, it's a header.
				358	# Finally, a 'vote' is taken at the end for each column, adding or
				359	# subtracting from the likelihood of the first row being a header.
				360
				361	def seval(item):
				362	"""
				363	Strips parens from item prior to calling eval in an
				364	attempt to make it safer
				365	"""
				366	return eval(item.replace('(', '').replace(')', ''))
				367
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	368	rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	369
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	370	header = rdr.next() # assume first row is header
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	371
				372	columns = len(header)
				373	columnTypes = {}
				374	for i in range(columns): columnTypes[i] = None
				375
				376	checked = 0
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	377	for row in rdr:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	378	# arbitrary number of rows to check, to keep it sane
				379	if checked > 20:
				380	break
				381	checked += 1
				382
				383	if len(row) != columns:
				384	continue # skip rows that have irregular number of columns
				385
				386	for col in columnTypes.keys():
				387	try:
				388	try:
				389	# is it a built-in type (besides string)?
				390	thisType = type(seval(row[col]))
				391	except OverflowError:
				392	# a long int?
				393	thisType = type(seval(row[col] + 'L'))
				394	thisType = type(0) # treat long ints as int
				395	except:
				396	# fallback to length of string
				397	thisType = len(row[col])
				398
				399	if thisType != columnTypes[col]:
				400	if columnTypes[col] is None: # add new column type
				401	columnTypes[col] = thisType
				402	else:
				403	# type is inconsistent, remove column from
				404	# consideration
				405	del columnTypes[col]
				406
				407	# finally, compare results against first row and "vote"
				408	# on whether it's a header
				409	hasHeader = 0
				410	for col, colType in columnTypes.items():
				411	if type(colType) == type(0): # it's a length
				412	if len(header[col]) != colType:
				413	hasHeader += 1
				414	else:
				415	hasHeader -= 1
				416	else: # attempt typecast
				417	try:
				418	eval("%s(%s)" % (colType.__name__, header[col]))
				419	except:
				420	hasHeader += 1
				421	else:
				422	hasHeader -= 1
				423
				424	return hasHeader > 0