Blame - Lib/csv.py - platform/external/python/cpython3

blob: fcb8a8fedcea790cb4c1759d80142a691d8fedfc [file] [log] [blame]

Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	1
				2	"""
				3	csv.py - read/write/investigate CSV files
				4	"""
				5
				6	import re
				7	from _csv import Error, __version__, writer, reader, register_dialect, \
				8	unregister_dialect, get_dialect, list_dialects, \
				9	QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
				10	__doc__
				11
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	12	try:
				13	from cStringIO import StringIO
				14	except ImportError:
				15	from StringIO import StringIO
				16
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	17	__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
				18	"Error", "Dialect", "excel", "excel_tab", "reader", "writer",
				19	"register_dialect", "get_dialect", "list_dialects", "Sniffer",
				20	"unregister_dialect", "__version__", "DictReader", "DictWriter" ]
				21
				22	class Dialect:
				23	_name = ""
				24	_valid = False
				25	# placeholders
				26	delimiter = None
				27	quotechar = None
				28	escapechar = None
				29	doublequote = None
				30	skipinitialspace = None
				31	lineterminator = None
				32	quoting = None
				33
				34	def __init__(self):
				35	if self.__class__ != Dialect:
				36	self._valid = True
				37	errors = self._validate()
				38	if errors != []:
				39	raise Error, "Dialect did not validate: %s" % ", ".join(errors)
				40
				41	def _validate(self):
				42	errors = []
				43	if not self._valid:
				44	errors.append("can't directly instantiate Dialect class")
				45
				46	if self.delimiter is None:
				47	errors.append("delimiter character not set")
				48	elif (not isinstance(self.delimiter, str) or
				49	len(self.delimiter) > 1):
				50	errors.append("delimiter must be one-character string")
				51
				52	if self.quotechar is None:
				53	if self.quoting != QUOTE_NONE:
				54	errors.append("quotechar not set")
				55	elif (not isinstance(self.quotechar, str) or
				56	len(self.quotechar) > 1):
				57	errors.append("quotechar must be one-character string")
				58
				59	if self.lineterminator is None:
				60	errors.append("lineterminator not set")
				61	elif not isinstance(self.lineterminator, str):
				62	errors.append("lineterminator must be a string")
				63
				64	if self.doublequote not in (True, False):
				65	errors.append("doublequote parameter must be True or False")
				66
				67	if self.skipinitialspace not in (True, False):
				68	errors.append("skipinitialspace parameter must be True or False")
				69
				70	if self.quoting is None:
				71	errors.append("quoting parameter not set")
				72
				73	if self.quoting is QUOTE_NONE:
				74	if (not isinstance(self.escapechar, (unicode, str)) or
				75	len(self.escapechar) > 1):
				76	errors.append("escapechar must be a one-character string or unicode object")
				77
				78	return errors
				79
				80	class excel(Dialect):
				81	delimiter = ','
				82	quotechar = '"'
				83	doublequote = True
				84	skipinitialspace = False
				85	lineterminator = '\r\n'
				86	quoting = QUOTE_MINIMAL
				87	register_dialect("excel", excel)
				88
				89	class excel_tab(excel):
				90	delimiter = '\t'
				91	register_dialect("excel-tab", excel_tab)
				92
				93
				94	class DictReader:
				95	def __init__(self, f, fieldnames, restkey=None, restval=None,
				96	dialect="excel", *args):
				97	self.fieldnames = fieldnames # list of keys for the dict
				98	self.restkey = restkey # key to catch long rows
				99	self.restval = restval # default value for short rows
				100	self.reader = reader(f, dialect, *args)
				101
				102	def __iter__(self):
				103	return self
				104
				105	def next(self):
				106	row = self.reader.next()
				107	# unlike the basic reader, we prefer not to return blanks,
				108	# because we will typically wind up with a dict full of None
				109	# values
				110	while row == []:
				111	row = self.reader.next()
				112	d = dict(zip(self.fieldnames, row))
				113	lf = len(self.fieldnames)
				114	lr = len(row)
				115	if lf < lr:
				116	d[self.restkey] = row[lf:]
				117	elif lf > lr:
				118	for key in self.fieldnames[lr:]:
				119	d[key] = self.restval
				120	return d
				121
				122
				123	class DictWriter:
				124	def __init__(self, f, fieldnames, restval="", extrasaction="raise",
				125	dialect="excel", *args):
				126	self.fieldnames = fieldnames # list of keys for the dict
				127	self.restval = restval # for writing short dicts
				128	if extrasaction.lower() not in ("raise", "ignore"):
				129	raise ValueError, \
				130	("extrasaction (%s) must be 'raise' or 'ignore'" %
				131	extrasaction)
				132	self.extrasaction = extrasaction
				133	self.writer = writer(f, dialect, *args)
				134
				135	def _dict_to_list(self, rowdict):
				136	if self.extrasaction == "raise":
				137	for k in rowdict.keys():
				138	if k not in self.fieldnames:
				139	raise ValueError, "dict contains fields not in fieldnames"
				140	return [rowdict.get(key, self.restval) for key in self.fieldnames]
				141
				142	def writerow(self, rowdict):
				143	return self.writer.writerow(self._dict_to_list(rowdict))
				144
				145	def writerows(self, rowdicts):
				146	rows = []
				147	for rowdict in rowdicts:
				148	rows.append(self._dict_to_list(rowdict))
				149	return self.writer.writerows(rows)
				150
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	151	# Guard Sniffer's type checking against builds that exclude complex()
				152	try:
				153	complex
				154	except NameError:
				155	complex = float
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	156
				157	class Sniffer:
				158	'''
				159	"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	160	Returns a Dialect object.
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	161	'''
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	162	def __init__(self):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	163	# in case there is more than one possible delimiter
				164	self.preferred = [',', '\t', ';', ' ', ':']
				165
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	166
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	167	def sniff(self, sample, delimiters=None):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	168	"""
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	169	Returns a dialect (or None) corresponding to the sample
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	170	"""
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	171
				172	quotechar, delimiter, skipinitialspace = \
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	173	self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	174	if delimiter is None:
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	175	delimiter, skipinitialspace = self._guess_delimiter(sample,
				176	delimiters)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	177
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	178	class dialect(Dialect):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	179	_name = "sniffed"
				180	lineterminator = '\r\n'
Fred Drake	7c852f3	2003-04-25 14:27:00 +0000	[diff] [blame]	181	quoting = QUOTE_MINIMAL
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	182	# escapechar = ''
				183	doublequote = False
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	184
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	185	dialect.delimiter = delimiter
				186	# _csv.reader won't accept a quotechar of ''
				187	dialect.quotechar = quotechar or '"'
				188	dialect.skipinitialspace = skipinitialspace
				189
				190	return dialect
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	191
				192
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	193	def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	194	"""
				195	Looks for text enclosed between two identical quotes
				196	(the probable quotechar) which are preceded and followed
				197	by the same character (the probable delimiter).
				198	For example:
				199	,'some text',
				200	The quote with the most wins, same with the delimiter.
				201	If there is no quotechar the delimiter can't be determined
				202	this way.
				203	"""
				204
				205	matches = []
				206	for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?P=delim)', # ,".?",
				207	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".?",
				208	'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?:$\|\n)', # ,".?"
				209	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?:$\|\n)'): # ".?" (no delim, no space)
Fred Drake	6f7b213	2003-09-02 16:01:07 +0000	[diff] [blame^]	210	regexp = re.compile(restr, re.DOTALL \| re.MULTILINE)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	211	matches = regexp.findall(data)
				212	if matches:
				213	break
				214
				215	if not matches:
				216	return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
				217
				218	quotes = {}
				219	delims = {}
				220	spaces = 0
				221	for m in matches:
				222	n = regexp.groupindex['quote'] - 1
				223	key = m[n]
				224	if key:
				225	quotes[key] = quotes.get(key, 0) + 1
				226	try:
				227	n = regexp.groupindex['delim'] - 1
				228	key = m[n]
				229	except KeyError:
				230	continue
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	231	if key and (delimiters is None or key in delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	232	delims[key] = delims.get(key, 0) + 1
				233	try:
				234	n = regexp.groupindex['space'] - 1
				235	except KeyError:
				236	continue
				237	if m[n]:
				238	spaces += 1
				239
				240	quotechar = reduce(lambda a, b, quotes = quotes:
				241	(quotes[a] > quotes[b]) and a or b, quotes.keys())
				242
				243	if delims:
				244	delim = reduce(lambda a, b, delims = delims:
				245	(delims[a] > delims[b]) and a or b, delims.keys())
				246	skipinitialspace = delims[delim] == spaces
				247	if delim == '\n': # most likely a file with a single column
				248	delim = ''
				249	else:
				250	# there is no delimiter, it's a single column of quoted data
				251	delim = ''
				252	skipinitialspace = 0
				253
				254	return (quotechar, delim, skipinitialspace)
				255
				256
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	257	def _guess_delimiter(self, data, delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	258	"""
				259	The delimiter /should/ occur the same number of times on
				260	each row. However, due to malformed data, it may not. We don't want
				261	an all or nothing approach, so we allow for small variations in this
				262	number.
				263	1) build a table of the frequency of each character on every line.
				264	2) build a table of freqencies of this frequency (meta-frequency?),
				265	e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
				266	7 times in 2 rows'
				267	3) use the mode of the meta-frequency to determine the /expected/
				268	frequency for that character
				269	4) find out how often the character actually meets that goal
				270	5) the character that best meets its goal is the delimiter
				271	For performance reasons, the data is evaluated in chunks, so it can
				272	try and evaluate the smallest portion of the data possible, evaluating
				273	additional chunks as necessary.
				274	"""
				275
				276	data = filter(None, data.split('\n'))
				277
				278	ascii = [chr(c) for c in range(127)] # 7-bit ASCII
				279
				280	# build frequency tables
				281	chunkLength = min(10, len(data))
				282	iteration = 0
				283	charFrequency = {}
				284	modes = {}
				285	delims = {}
				286	start, end = 0, min(chunkLength, len(data))
				287	while start < len(data):
				288	iteration += 1
				289	for line in data[start:end]:
				290	for char in ascii:
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	291	metaFrequency = charFrequency.get(char, {})
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	292	# must count even if frequency is 0
				293	freq = line.strip().count(char)
				294	# value is the mode
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	295	metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
				296	charFrequency[char] = metaFrequency
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	297
				298	for char in charFrequency.keys():
				299	items = charFrequency[char].items()
				300	if len(items) == 1 and items[0][0] == 0:
				301	continue
				302	# get the mode of the frequencies
				303	if len(items) > 1:
				304	modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
				305	items)
				306	# adjust the mode - subtract the sum of all
				307	# other frequencies
				308	items.remove(modes[char])
				309	modes[char] = (modes[char][0], modes[char][1]
				310	- reduce(lambda a, b: (0, a[1] + b[1]),
				311	items)[1])
				312	else:
				313	modes[char] = items[0]
				314
				315	# build a list of possible delimiters
				316	modeList = modes.items()
				317	total = float(chunkLength * iteration)
				318	# (rows of consistent data) / (number of rows) = 100%
				319	consistency = 1.0
				320	# minimum consistency threshold
				321	threshold = 0.9
				322	while len(delims) == 0 and consistency >= threshold:
				323	for k, v in modeList:
				324	if v[0] > 0 and v[1] > 0:
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	325	if ((v[1]/total) >= consistency and
				326	(delimiters is None or k in delimiters)):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	327	delims[k] = v
				328	consistency -= 0.01
				329
				330	if len(delims) == 1:
				331	delim = delims.keys()[0]
				332	skipinitialspace = (data[0].count(delim) ==
				333	data[0].count("%c " % delim))
				334	return (delim, skipinitialspace)
				335
				336	# analyze another chunkLength lines
				337	start = end
				338	end += chunkLength
				339
				340	if not delims:
				341	return ('', 0)
				342
				343	# if there's more than one, fall back to a 'preferred' list
				344	if len(delims) > 1:
				345	for d in self.preferred:
				346	if d in delims.keys():
				347	skipinitialspace = (data[0].count(d) ==
				348	data[0].count("%c " % d))
				349	return (d, skipinitialspace)
				350
				351	# finally, just return the first damn character in the list
				352	delim = delims.keys()[0]
				353	skipinitialspace = (data[0].count(delim) ==
				354	data[0].count("%c " % delim))
				355	return (delim, skipinitialspace)
				356
				357
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	358	def has_header(self, sample):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	359	# Creates a dictionary of types of data in each column. If any
				360	# column is of a single type (say, integers), except for the first
				361	# row, then the first row is presumed to be labels. If the type
				362	# can't be determined, it is assumed to be a string in which case
				363	# the length of the string is the determining factor: if all of the
				364	# rows except for the first are the same length, it's a header.
				365	# Finally, a 'vote' is taken at the end for each column, adding or
				366	# subtracting from the likelihood of the first row being a header.
				367
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	368	rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	369
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	370	header = rdr.next() # assume first row is header
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	371
				372	columns = len(header)
				373	columnTypes = {}
				374	for i in range(columns): columnTypes[i] = None
				375
				376	checked = 0
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	377	for row in rdr:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	378	# arbitrary number of rows to check, to keep it sane
				379	if checked > 20:
				380	break
				381	checked += 1
				382
				383	if len(row) != columns:
				384	continue # skip rows that have irregular number of columns
				385
				386	for col in columnTypes.keys():
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	387
				388	for thisType in [int, long, float, complex]:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	389	try:
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	390	thisType(row[col])
				391	break
Raymond Hettinger	abe14e6	2003-06-12 03:59:17 +0000	[diff] [blame]	392	except (ValueError, OverflowError):
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	393	pass
				394	else:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	395	# fallback to length of string
				396	thisType = len(row[col])
				397
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	398	# treat longs as ints
				399	if thisType == long:
				400	thisType = int
				401
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	402	if thisType != columnTypes[col]:
				403	if columnTypes[col] is None: # add new column type
				404	columnTypes[col] = thisType
				405	else:
				406	# type is inconsistent, remove column from
				407	# consideration
				408	del columnTypes[col]
				409
				410	# finally, compare results against first row and "vote"
				411	# on whether it's a header
				412	hasHeader = 0
				413	for col, colType in columnTypes.items():
				414	if type(colType) == type(0): # it's a length
				415	if len(header[col]) != colType:
				416	hasHeader += 1
				417	else:
				418	hasHeader -= 1
				419	else: # attempt typecast
				420	try:
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	421	colType(header[col])
Raymond Hettinger	f31cb0c	2003-06-12 04:05:00 +0000	[diff] [blame]	422	except (ValueError, TypeError):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	423	hasHeader += 1
				424	else:
				425	hasHeader -= 1
				426
				427	return hasHeader > 0