"""
csv.py - read/write/investigate CSV files
"""
| 5 | |
| 6 | import re |
| 7 | from _csv import Error, __version__, writer, reader, register_dialect, \ |
| 8 | unregister_dialect, get_dialect, list_dialects, \ |
| 9 | QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ |
| 10 | __doc__ |
| 11 | |
| 12 | __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", |
| 13 | "Error", "Dialect", "excel", "excel_tab", "reader", "writer", |
| 14 | "register_dialect", "get_dialect", "list_dialects", "Sniffer", |
| 15 | "unregister_dialect", "__version__", "DictReader", "DictWriter" ] |
| 16 | |
| 17 | class Dialect: |
| 18 | _name = "" |
| 19 | _valid = False |
| 20 | # placeholders |
| 21 | delimiter = None |
| 22 | quotechar = None |
| 23 | escapechar = None |
| 24 | doublequote = None |
| 25 | skipinitialspace = None |
| 26 | lineterminator = None |
| 27 | quoting = None |
| 28 | |
| 29 | def __init__(self): |
| 30 | if self.__class__ != Dialect: |
| 31 | self._valid = True |
| 32 | errors = self._validate() |
| 33 | if errors != []: |
| 34 | raise Error, "Dialect did not validate: %s" % ", ".join(errors) |
| 35 | |
| 36 | def _validate(self): |
| 37 | errors = [] |
| 38 | if not self._valid: |
| 39 | errors.append("can't directly instantiate Dialect class") |
| 40 | |
| 41 | if self.delimiter is None: |
| 42 | errors.append("delimiter character not set") |
| 43 | elif (not isinstance(self.delimiter, str) or |
| 44 | len(self.delimiter) > 1): |
| 45 | errors.append("delimiter must be one-character string") |
| 46 | |
| 47 | if self.quotechar is None: |
| 48 | if self.quoting != QUOTE_NONE: |
| 49 | errors.append("quotechar not set") |
| 50 | elif (not isinstance(self.quotechar, str) or |
| 51 | len(self.quotechar) > 1): |
| 52 | errors.append("quotechar must be one-character string") |
| 53 | |
| 54 | if self.lineterminator is None: |
| 55 | errors.append("lineterminator not set") |
| 56 | elif not isinstance(self.lineterminator, str): |
| 57 | errors.append("lineterminator must be a string") |
| 58 | |
| 59 | if self.doublequote not in (True, False): |
| 60 | errors.append("doublequote parameter must be True or False") |
| 61 | |
| 62 | if self.skipinitialspace not in (True, False): |
| 63 | errors.append("skipinitialspace parameter must be True or False") |
| 64 | |
| 65 | if self.quoting is None: |
| 66 | errors.append("quoting parameter not set") |
| 67 | |
| 68 | if self.quoting is QUOTE_NONE: |
| 69 | if (not isinstance(self.escapechar, (unicode, str)) or |
| 70 | len(self.escapechar) > 1): |
| 71 | errors.append("escapechar must be a one-character string or unicode object") |
| 72 | |
| 73 | return errors |
| 74 | |
| 75 | class excel(Dialect): |
| 76 | delimiter = ',' |
| 77 | quotechar = '"' |
| 78 | doublequote = True |
| 79 | skipinitialspace = False |
| 80 | lineterminator = '\r\n' |
| 81 | quoting = QUOTE_MINIMAL |
| 82 | register_dialect("excel", excel) |
| 83 | |
| 84 | class excel_tab(excel): |
| 85 | delimiter = '\t' |
| 86 | register_dialect("excel-tab", excel_tab) |
| 87 | |
| 88 | |
| 89 | class DictReader: |
| 90 | def __init__(self, f, fieldnames, restkey=None, restval=None, |
| 91 | dialect="excel", *args): |
| 92 | self.fieldnames = fieldnames # list of keys for the dict |
| 93 | self.restkey = restkey # key to catch long rows |
| 94 | self.restval = restval # default value for short rows |
| 95 | self.reader = reader(f, dialect, *args) |
| 96 | |
| 97 | def __iter__(self): |
| 98 | return self |
| 99 | |
| 100 | def next(self): |
| 101 | row = self.reader.next() |
| 102 | # unlike the basic reader, we prefer not to return blanks, |
| 103 | # because we will typically wind up with a dict full of None |
| 104 | # values |
| 105 | while row == []: |
| 106 | row = self.reader.next() |
| 107 | d = dict(zip(self.fieldnames, row)) |
| 108 | lf = len(self.fieldnames) |
| 109 | lr = len(row) |
| 110 | if lf < lr: |
| 111 | d[self.restkey] = row[lf:] |
| 112 | elif lf > lr: |
| 113 | for key in self.fieldnames[lr:]: |
| 114 | d[key] = self.restval |
| 115 | return d |
| 116 | |
| 117 | |
| 118 | class DictWriter: |
| 119 | def __init__(self, f, fieldnames, restval="", extrasaction="raise", |
| 120 | dialect="excel", *args): |
| 121 | self.fieldnames = fieldnames # list of keys for the dict |
| 122 | self.restval = restval # for writing short dicts |
| 123 | if extrasaction.lower() not in ("raise", "ignore"): |
| 124 | raise ValueError, \ |
| 125 | ("extrasaction (%s) must be 'raise' or 'ignore'" % |
| 126 | extrasaction) |
| 127 | self.extrasaction = extrasaction |
| 128 | self.writer = writer(f, dialect, *args) |
| 129 | |
| 130 | def _dict_to_list(self, rowdict): |
| 131 | if self.extrasaction == "raise": |
| 132 | for k in rowdict.keys(): |
| 133 | if k not in self.fieldnames: |
| 134 | raise ValueError, "dict contains fields not in fieldnames" |
| 135 | return [rowdict.get(key, self.restval) for key in self.fieldnames] |
| 136 | |
| 137 | def writerow(self, rowdict): |
| 138 | return self.writer.writerow(self._dict_to_list(rowdict)) |
| 139 | |
| 140 | def writerows(self, rowdicts): |
| 141 | rows = [] |
| 142 | for rowdict in rowdicts: |
| 143 | rows.append(self._dict_to_list(rowdict)) |
| 144 | return self.writer.writerows(rows) |
| 145 | |
| 146 | |
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a csv.Dialect object.
    '''
    def __init__(self, sample = 16 * 1024):
        # in case there is more than one possible delimiter, prefer
        # these, in order
        self.preferred = [',', '\t', ';', ' ', ':']

        # amount of data (in bytes) to sample
        self.sample = sample


    def sniff(self, fileobj):
        """
        Takes a file-like object and returns a dialect (or None).

        The quote-based guess is tried first; if no quoted fields are
        found, fall back to the character-frequency heuristic.
        """
        self.fileobj = fileobj

        data = fileobj.read(self.sample)

        quotechar, delimiter, skipinitialspace = \
                   self._guessQuoteAndDelimiter(data)
        if delimiter is None:
            delimiter, skipinitialspace = self._guessDelimiter(data)

        class SniffedDialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False
        SniffedDialect.delimiter = delimiter
        SniffedDialect.quotechar = quotechar
        SniffedDialect.skipinitialspace = skipinitialspace

        # note: self.dialect is the dialect *class*, not an instance
        self.dialect = SniffedDialect
        return self.dialect


    def hasHeaders(self):
        """Return True if the file sniffed last appears to start with a
        header row.  Must be called after sniff()."""
        return self._hasHeaders(self.fileobj, self.dialect)


    def register_dialect(self, name='sniffed'):
        """Register the sniffed dialect under *name* (module-level
        register_dialect, not this method, is resolved in the body)."""
        register_dialect(name, self.dialect)


    def _guessQuoteAndDelimiter(self, data):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, delimiter, skipinitialspace).
        """

        matches = []
        # Bug fix: the third pattern used to read (?P<delim>>[^\w\n"\']),
        # whose stray '>' forced every sniffed delimiter to begin with a
        # literal '>' -- the pattern could never match normal data.
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.S | re.M)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        # tally how often each candidate quote/delimiter appears
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # pattern 4 has no delim group
                continue
            if key:
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # most frequently seen quote character wins
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            # if every delimiter occurrence was followed by a space,
            # assume skipinitialspace
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guessDelimiter(self, data):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace).
        """

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metafrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metafrequency[freq] = metafrequency.get(freq, 0) + 1
                    charFrequency[char] = metafrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    # character never appears; not a candidate
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            # relax the consistency requirement gradually until at least
            # one candidate qualifies (or we hit the threshold)
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if (v[1]/total) >= consistency:
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = delims.keys()[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def _hasHeaders(self, fileobj, dialect):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        def seval(item):
            """
            Strips parens from item prior to calling eval in an
            attempt to make it safer

            SECURITY NOTE: this still evals file contents; do not use
            on untrusted data.
            """
            return eval(item.replace('(', '').replace(')', ''))

        # rewind the fileobj - this might not work for some file-like
        # objects...
        fileobj.seek(0)

        # Bug fix: this used to call csv.reader(...), but this module *is*
        # csv and never imports itself -- a guaranteed NameError.  Use the
        # reader imported from _csv at the top of the file instead.
        r = reader(fileobj,
                   delimiter=dialect.delimiter,
                   quotechar=dialect.quotechar,
                   skipinitialspace=dialect.skipinitialspace)

        header = r.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in r:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():
                try:
                    try:
                        # is it a built-in type (besides string)?
                        thisType = type(seval(row[col]))
                    except OverflowError:
                        # a long int?
                        thisType = type(seval(row[col] + 'L'))
                        thisType = type(0) # treat long ints as int
                except:
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    eval("%s(%s)" % (colType.__name__, header[col]))
                except:
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0