Blame - Lib/csv.py - platform/external/python/cpython2

blob: 83b8aa447def68483c1dc7bb7e910e7856b02432 [file] [log] [blame]

Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	1
				2	"""
				3	csv.py - read/write/investigate CSV files
				4	"""
				5
				6	import re
				7	from _csv import Error, __version__, writer, reader, register_dialect, \
				8	unregister_dialect, get_dialect, list_dialects, \
				9	QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
				10	__doc__
				11
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	12	try:
				13	from cStringIO import StringIO
				14	except ImportError:
				15	from StringIO import StringIO
				16
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	17	__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
				18	"Error", "Dialect", "excel", "excel_tab", "reader", "writer",
				19	"register_dialect", "get_dialect", "list_dialects", "Sniffer",
				20	"unregister_dialect", "__version__", "DictReader", "DictWriter" ]
				21
				22	class Dialect:
				23	_name = ""
				24	_valid = False
				25	# placeholders
				26	delimiter = None
				27	quotechar = None
				28	escapechar = None
				29	doublequote = None
				30	skipinitialspace = None
				31	lineterminator = None
				32	quoting = None
				33
				34	def __init__(self):
				35	if self.__class__ != Dialect:
				36	self._valid = True
				37	errors = self._validate()
				38	if errors != []:
				39	raise Error, "Dialect did not validate: %s" % ", ".join(errors)
				40
				41	def _validate(self):
				42	errors = []
				43	if not self._valid:
				44	errors.append("can't directly instantiate Dialect class")
				45
				46	if self.delimiter is None:
				47	errors.append("delimiter character not set")
				48	elif (not isinstance(self.delimiter, str) or
				49	len(self.delimiter) > 1):
				50	errors.append("delimiter must be one-character string")
				51
				52	if self.quotechar is None:
				53	if self.quoting != QUOTE_NONE:
				54	errors.append("quotechar not set")
				55	elif (not isinstance(self.quotechar, str) or
				56	len(self.quotechar) > 1):
				57	errors.append("quotechar must be one-character string")
				58
				59	if self.lineterminator is None:
				60	errors.append("lineterminator not set")
				61	elif not isinstance(self.lineterminator, str):
				62	errors.append("lineterminator must be a string")
				63
				64	if self.doublequote not in (True, False):
				65	errors.append("doublequote parameter must be True or False")
				66
				67	if self.skipinitialspace not in (True, False):
				68	errors.append("skipinitialspace parameter must be True or False")
				69
				70	if self.quoting is None:
				71	errors.append("quoting parameter not set")
				72
				73	if self.quoting is QUOTE_NONE:
				74	if (not isinstance(self.escapechar, (unicode, str)) or
				75	len(self.escapechar) > 1):
				76	errors.append("escapechar must be a one-character string or unicode object")
				77
				78	return errors
				79
				80	class excel(Dialect):
				81	delimiter = ','
				82	quotechar = '"'
				83	doublequote = True
				84	skipinitialspace = False
				85	lineterminator = '\r\n'
				86	quoting = QUOTE_MINIMAL
				87	register_dialect("excel", excel)
				88
				89	class excel_tab(excel):
				90	delimiter = '\t'
				91	register_dialect("excel-tab", excel_tab)
				92
				93
				94	class DictReader:
				95	def __init__(self, f, fieldnames, restkey=None, restval=None,
				96	dialect="excel", *args):
				97	self.fieldnames = fieldnames # list of keys for the dict
				98	self.restkey = restkey # key to catch long rows
				99	self.restval = restval # default value for short rows
				100	self.reader = reader(f, dialect, *args)
				101
				102	def __iter__(self):
				103	return self
				104
				105	def next(self):
				106	row = self.reader.next()
				107	# unlike the basic reader, we prefer not to return blanks,
				108	# because we will typically wind up with a dict full of None
				109	# values
				110	while row == []:
				111	row = self.reader.next()
				112	d = dict(zip(self.fieldnames, row))
				113	lf = len(self.fieldnames)
				114	lr = len(row)
				115	if lf < lr:
				116	d[self.restkey] = row[lf:]
				117	elif lf > lr:
				118	for key in self.fieldnames[lr:]:
				119	d[key] = self.restval
				120	return d
				121
				122
				123	class DictWriter:
				124	def __init__(self, f, fieldnames, restval="", extrasaction="raise",
				125	dialect="excel", *args):
				126	self.fieldnames = fieldnames # list of keys for the dict
				127	self.restval = restval # for writing short dicts
				128	if extrasaction.lower() not in ("raise", "ignore"):
				129	raise ValueError, \
				130	("extrasaction (%s) must be 'raise' or 'ignore'" %
				131	extrasaction)
				132	self.extrasaction = extrasaction
				133	self.writer = writer(f, dialect, *args)
				134
				135	def _dict_to_list(self, rowdict):
				136	if self.extrasaction == "raise":
				137	for k in rowdict.keys():
				138	if k not in self.fieldnames:
				139	raise ValueError, "dict contains fields not in fieldnames"
				140	return [rowdict.get(key, self.restval) for key in self.fieldnames]
				141
				142	def writerow(self, rowdict):
				143	return self.writer.writerow(self._dict_to_list(rowdict))
				144
				145	def writerows(self, rowdicts):
				146	rows = []
				147	for rowdict in rowdicts:
				148	rows.append(self._dict_to_list(rowdict))
				149	return self.writer.writerows(rows)
				150
				151
				152	class Sniffer:
				153	'''
				154	"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	155	Returns a Dialect object.
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	156	'''
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	157	def __init__(self):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	158	# in case there is more than one possible delimiter
				159	self.preferred = [',', '\t', ';', ' ', ':']
				160
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	161
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	162	def sniff(self, sample, delimiters=None):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	163	"""
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	164	Returns a dialect (or None) corresponding to the sample
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	165	"""
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	166
				167	quotechar, delimiter, skipinitialspace = \
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	168	self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	169	if delimiter is None:
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	170	delimiter, skipinitialspace = self._guess_delimiter(sample,
				171	delimiters)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	172
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	173	class dialect(Dialect):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	174	_name = "sniffed"
				175	lineterminator = '\r\n'
Fred Drake	7c852f3	2003-04-25 14:27:00 +0000	[diff] [blame]	176	quoting = QUOTE_MINIMAL
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	177	# escapechar = ''
				178	doublequote = False
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	179
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	180	dialect.delimiter = delimiter
				181	# _csv.reader won't accept a quotechar of ''
				182	dialect.quotechar = quotechar or '"'
				183	dialect.skipinitialspace = skipinitialspace
				184
				185	return dialect
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	186
				187
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	188	def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	189	"""
				190	Looks for text enclosed between two identical quotes
				191	(the probable quotechar) which are preceded and followed
				192	by the same character (the probable delimiter).
				193	For example:
				194	,'some text',
				195	The quote with the most wins, same with the delimiter.
				196	If there is no quotechar the delimiter can't be determined
				197	this way.
				198	"""
				199
				200	matches = []
				201	for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?P=delim)', # ,".?",
				202	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".?",
				203	'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?:$\|\n)', # ,".?"
				204	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?:$\|\n)'): # ".?" (no delim, no space)
				205	regexp = re.compile(restr, re.S \| re.M)
				206	matches = regexp.findall(data)
				207	if matches:
				208	break
				209
				210	if not matches:
				211	return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
				212
				213	quotes = {}
				214	delims = {}
				215	spaces = 0
				216	for m in matches:
				217	n = regexp.groupindex['quote'] - 1
				218	key = m[n]
				219	if key:
				220	quotes[key] = quotes.get(key, 0) + 1
				221	try:
				222	n = regexp.groupindex['delim'] - 1
				223	key = m[n]
				224	except KeyError:
				225	continue
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	226	if key and (delimiters is None or key in delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	227	delims[key] = delims.get(key, 0) + 1
				228	try:
				229	n = regexp.groupindex['space'] - 1
				230	except KeyError:
				231	continue
				232	if m[n]:
				233	spaces += 1
				234
				235	quotechar = reduce(lambda a, b, quotes = quotes:
				236	(quotes[a] > quotes[b]) and a or b, quotes.keys())
				237
				238	if delims:
				239	delim = reduce(lambda a, b, delims = delims:
				240	(delims[a] > delims[b]) and a or b, delims.keys())
				241	skipinitialspace = delims[delim] == spaces
				242	if delim == '\n': # most likely a file with a single column
				243	delim = ''
				244	else:
				245	# there is no delimiter, it's a single column of quoted data
				246	delim = ''
				247	skipinitialspace = 0
				248
				249	return (quotechar, delim, skipinitialspace)
				250
				251
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	252	def _guess_delimiter(self, data, delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	253	"""
				254	The delimiter /should/ occur the same number of times on
				255	each row. However, due to malformed data, it may not. We don't want
				256	an all or nothing approach, so we allow for small variations in this
				257	number.
				258	1) build a table of the frequency of each character on every line.
				259	2) build a table of freqencies of this frequency (meta-frequency?),
				260	e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
				261	7 times in 2 rows'
				262	3) use the mode of the meta-frequency to determine the /expected/
				263	frequency for that character
				264	4) find out how often the character actually meets that goal
				265	5) the character that best meets its goal is the delimiter
				266	For performance reasons, the data is evaluated in chunks, so it can
				267	try and evaluate the smallest portion of the data possible, evaluating
				268	additional chunks as necessary.
				269	"""
				270
				271	data = filter(None, data.split('\n'))
				272
				273	ascii = [chr(c) for c in range(127)] # 7-bit ASCII
				274
				275	# build frequency tables
				276	chunkLength = min(10, len(data))
				277	iteration = 0
				278	charFrequency = {}
				279	modes = {}
				280	delims = {}
				281	start, end = 0, min(chunkLength, len(data))
				282	while start < len(data):
				283	iteration += 1
				284	for line in data[start:end]:
				285	for char in ascii:
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	286	metaFrequency = charFrequency.get(char, {})
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	287	# must count even if frequency is 0
				288	freq = line.strip().count(char)
				289	# value is the mode
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	290	metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
				291	charFrequency[char] = metaFrequency
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	292
				293	for char in charFrequency.keys():
				294	items = charFrequency[char].items()
				295	if len(items) == 1 and items[0][0] == 0:
				296	continue
				297	# get the mode of the frequencies
				298	if len(items) > 1:
				299	modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
				300	items)
				301	# adjust the mode - subtract the sum of all
				302	# other frequencies
				303	items.remove(modes[char])
				304	modes[char] = (modes[char][0], modes[char][1]
				305	- reduce(lambda a, b: (0, a[1] + b[1]),
				306	items)[1])
				307	else:
				308	modes[char] = items[0]
				309
				310	# build a list of possible delimiters
				311	modeList = modes.items()
				312	total = float(chunkLength * iteration)
				313	# (rows of consistent data) / (number of rows) = 100%
				314	consistency = 1.0
				315	# minimum consistency threshold
				316	threshold = 0.9
				317	while len(delims) == 0 and consistency >= threshold:
				318	for k, v in modeList:
				319	if v[0] > 0 and v[1] > 0:
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	320	if ((v[1]/total) >= consistency and
				321	(delimiters is None or k in delimiters)):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	322	delims[k] = v
				323	consistency -= 0.01
				324
				325	if len(delims) == 1:
				326	delim = delims.keys()[0]
				327	skipinitialspace = (data[0].count(delim) ==
				328	data[0].count("%c " % delim))
				329	return (delim, skipinitialspace)
				330
				331	# analyze another chunkLength lines
				332	start = end
				333	end += chunkLength
				334
				335	if not delims:
				336	return ('', 0)
				337
				338	# if there's more than one, fall back to a 'preferred' list
				339	if len(delims) > 1:
				340	for d in self.preferred:
				341	if d in delims.keys():
				342	skipinitialspace = (data[0].count(d) ==
				343	data[0].count("%c " % d))
				344	return (d, skipinitialspace)
				345
				346	# finally, just return the first damn character in the list
				347	delim = delims.keys()[0]
				348	skipinitialspace = (data[0].count(delim) ==
				349	data[0].count("%c " % delim))
				350	return (delim, skipinitialspace)
				351
				352
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	353	def has_header(self, sample):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	354	# Creates a dictionary of types of data in each column. If any
				355	# column is of a single type (say, integers), except for the first
				356	# row, then the first row is presumed to be labels. If the type
				357	# can't be determined, it is assumed to be a string in which case
				358	# the length of the string is the determining factor: if all of the
				359	# rows except for the first are the same length, it's a header.
				360	# Finally, a 'vote' is taken at the end for each column, adding or
				361	# subtracting from the likelihood of the first row being a header.
				362
				363	def seval(item):
				364	"""
				365	Strips parens from item prior to calling eval in an
				366	attempt to make it safer
				367	"""
				368	return eval(item.replace('(', '').replace(')', ''))
				369
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	370	rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	371
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	372	header = rdr.next() # assume first row is header
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	373
				374	columns = len(header)
				375	columnTypes = {}
				376	for i in range(columns): columnTypes[i] = None
				377
				378	checked = 0
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	379	for row in rdr:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	380	# arbitrary number of rows to check, to keep it sane
				381	if checked > 20:
				382	break
				383	checked += 1
				384
				385	if len(row) != columns:
				386	continue # skip rows that have irregular number of columns
				387
				388	for col in columnTypes.keys():
				389	try:
				390	try:
				391	# is it a built-in type (besides string)?
				392	thisType = type(seval(row[col]))
				393	except OverflowError:
				394	# a long int?
				395	thisType = type(seval(row[col] + 'L'))
				396	thisType = type(0) # treat long ints as int
				397	except:
				398	# fallback to length of string
				399	thisType = len(row[col])
				400
				401	if thisType != columnTypes[col]:
				402	if columnTypes[col] is None: # add new column type
				403	columnTypes[col] = thisType
				404	else:
				405	# type is inconsistent, remove column from
				406	# consideration
				407	del columnTypes[col]
				408
				409	# finally, compare results against first row and "vote"
				410	# on whether it's a header
				411	hasHeader = 0
				412	for col, colType in columnTypes.items():
				413	if type(colType) == type(0): # it's a length
				414	if len(header[col]) != colType:
				415	hasHeader += 1
				416	else:
				417	hasHeader -= 1
				418	else: # attempt typecast
				419	try:
				420	eval("%s(%s)" % (colType.__name__, header[col]))
				421	except:
				422	hasHeader += 1
				423	else:
				424	hasHeader -= 1
				425
				426	return hasHeader > 0