Blame - Lib/csv.py - platform/external/python/cpython3

blob: f2389fd30425944a39163b92ae7b36f118efe300 [file] [log] [blame]

Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	1
				2	"""
				3	csv.py - read/write/investigate CSV files
				4	"""
				5
				6	import re
				7	from _csv import Error, __version__, writer, reader, register_dialect, \
				8	unregister_dialect, get_dialect, list_dialects, \
				9	QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
				10	__doc__
				11
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	12	try:
				13	from cStringIO import StringIO
				14	except ImportError:
				15	from StringIO import StringIO
				16
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	17	__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
				18	"Error", "Dialect", "excel", "excel_tab", "reader", "writer",
				19	"register_dialect", "get_dialect", "list_dialects", "Sniffer",
				20	"unregister_dialect", "__version__", "DictReader", "DictWriter" ]
				21
				22	class Dialect:
				23	_name = ""
				24	_valid = False
				25	# placeholders
				26	delimiter = None
				27	quotechar = None
				28	escapechar = None
				29	doublequote = None
				30	skipinitialspace = None
				31	lineterminator = None
				32	quoting = None
				33
				34	def __init__(self):
				35	if self.__class__ != Dialect:
				36	self._valid = True
				37	errors = self._validate()
				38	if errors != []:
				39	raise Error, "Dialect did not validate: %s" % ", ".join(errors)
				40
				41	def _validate(self):
				42	errors = []
				43	if not self._valid:
				44	errors.append("can't directly instantiate Dialect class")
				45
				46	if self.delimiter is None:
				47	errors.append("delimiter character not set")
				48	elif (not isinstance(self.delimiter, str) or
				49	len(self.delimiter) > 1):
				50	errors.append("delimiter must be one-character string")
				51
				52	if self.quotechar is None:
				53	if self.quoting != QUOTE_NONE:
				54	errors.append("quotechar not set")
				55	elif (not isinstance(self.quotechar, str) or
				56	len(self.quotechar) > 1):
				57	errors.append("quotechar must be one-character string")
				58
				59	if self.lineterminator is None:
				60	errors.append("lineterminator not set")
				61	elif not isinstance(self.lineterminator, str):
				62	errors.append("lineterminator must be a string")
				63
				64	if self.doublequote not in (True, False):
				65	errors.append("doublequote parameter must be True or False")
				66
				67	if self.skipinitialspace not in (True, False):
				68	errors.append("skipinitialspace parameter must be True or False")
				69
				70	if self.quoting is None:
				71	errors.append("quoting parameter not set")
				72
				73	if self.quoting is QUOTE_NONE:
				74	if (not isinstance(self.escapechar, (unicode, str)) or
				75	len(self.escapechar) > 1):
				76	errors.append("escapechar must be a one-character string or unicode object")
				77
				78	return errors
				79
				80	class excel(Dialect):
				81	delimiter = ','
				82	quotechar = '"'
				83	doublequote = True
				84	skipinitialspace = False
				85	lineterminator = '\r\n'
				86	quoting = QUOTE_MINIMAL
				87	register_dialect("excel", excel)
				88
				89	class excel_tab(excel):
				90	delimiter = '\t'
				91	register_dialect("excel-tab", excel_tab)
				92
				93
				94	class DictReader:
Skip Montanaro	dffeed3	2003-10-03 14:03:01 +0000	[diff] [blame^]	95	def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro	3f7a948	2003-09-06 19:52:12 +0000	[diff] [blame]	96	dialect="excel", args, *kwds):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	97	self.fieldnames = fieldnames # list of keys for the dict
				98	self.restkey = restkey # key to catch long rows
				99	self.restval = restval # default value for short rows
Skip Montanaro	3f7a948	2003-09-06 19:52:12 +0000	[diff] [blame]	100	self.reader = reader(f, dialect, args, *kwds)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	101
				102	def __iter__(self):
				103	return self
				104
				105	def next(self):
				106	row = self.reader.next()
Skip Montanaro	dffeed3	2003-10-03 14:03:01 +0000	[diff] [blame^]	107	if self.fieldnames is None:
				108	self.fieldnames = row
				109	row = self.reader.next()
				110
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	111	# unlike the basic reader, we prefer not to return blanks,
				112	# because we will typically wind up with a dict full of None
				113	# values
				114	while row == []:
				115	row = self.reader.next()
				116	d = dict(zip(self.fieldnames, row))
				117	lf = len(self.fieldnames)
				118	lr = len(row)
				119	if lf < lr:
				120	d[self.restkey] = row[lf:]
				121	elif lf > lr:
				122	for key in self.fieldnames[lr:]:
				123	d[key] = self.restval
				124	return d
				125
				126
				127	class DictWriter:
				128	def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro	3f7a948	2003-09-06 19:52:12 +0000	[diff] [blame]	129	dialect="excel", args, *kwds):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	130	self.fieldnames = fieldnames # list of keys for the dict
				131	self.restval = restval # for writing short dicts
				132	if extrasaction.lower() not in ("raise", "ignore"):
				133	raise ValueError, \
				134	("extrasaction (%s) must be 'raise' or 'ignore'" %
				135	extrasaction)
				136	self.extrasaction = extrasaction
Skip Montanaro	3f7a948	2003-09-06 19:52:12 +0000	[diff] [blame]	137	self.writer = writer(f, dialect, args, *kwds)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	138
				139	def _dict_to_list(self, rowdict):
				140	if self.extrasaction == "raise":
				141	for k in rowdict.keys():
				142	if k not in self.fieldnames:
				143	raise ValueError, "dict contains fields not in fieldnames"
				144	return [rowdict.get(key, self.restval) for key in self.fieldnames]
				145
				146	def writerow(self, rowdict):
				147	return self.writer.writerow(self._dict_to_list(rowdict))
				148
				149	def writerows(self, rowdicts):
				150	rows = []
				151	for rowdict in rowdicts:
				152	rows.append(self._dict_to_list(rowdict))
				153	return self.writer.writerows(rows)
				154
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	155	# Guard Sniffer's type checking against builds that exclude complex()
				156	try:
				157	complex
				158	except NameError:
				159	complex = float
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	160
				161	class Sniffer:
				162	'''
				163	"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	164	Returns a Dialect object.
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	165	'''
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	166	def __init__(self):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	167	# in case there is more than one possible delimiter
				168	self.preferred = [',', '\t', ';', ' ', ':']
				169
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	170
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	171	def sniff(self, sample, delimiters=None):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	172	"""
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	173	Returns a dialect (or None) corresponding to the sample
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	174	"""
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	175
				176	quotechar, delimiter, skipinitialspace = \
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	177	self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	178	if delimiter is None:
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	179	delimiter, skipinitialspace = self._guess_delimiter(sample,
				180	delimiters)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	181
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	182	class dialect(Dialect):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	183	_name = "sniffed"
				184	lineterminator = '\r\n'
Fred Drake	7c852f3	2003-04-25 14:27:00 +0000	[diff] [blame]	185	quoting = QUOTE_MINIMAL
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	186	# escapechar = ''
				187	doublequote = False
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	188
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	189	dialect.delimiter = delimiter
				190	# _csv.reader won't accept a quotechar of ''
				191	dialect.quotechar = quotechar or '"'
				192	dialect.skipinitialspace = skipinitialspace
				193
				194	return dialect
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	195
				196
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	197	def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	198	"""
				199	Looks for text enclosed between two identical quotes
				200	(the probable quotechar) which are preceded and followed
				201	by the same character (the probable delimiter).
				202	For example:
				203	,'some text',
				204	The quote with the most wins, same with the delimiter.
				205	If there is no quotechar the delimiter can't be determined
				206	this way.
				207	"""
				208
				209	matches = []
				210	for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?P=delim)', # ,".?",
				211	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".?",
				212	'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?:$\|\n)', # ,".?"
				213	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?:$\|\n)'): # ".?" (no delim, no space)
Fred Drake	6f7b213	2003-09-02 16:01:07 +0000	[diff] [blame]	214	regexp = re.compile(restr, re.DOTALL \| re.MULTILINE)
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	215	matches = regexp.findall(data)
				216	if matches:
				217	break
				218
				219	if not matches:
				220	return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
				221
				222	quotes = {}
				223	delims = {}
				224	spaces = 0
				225	for m in matches:
				226	n = regexp.groupindex['quote'] - 1
				227	key = m[n]
				228	if key:
				229	quotes[key] = quotes.get(key, 0) + 1
				230	try:
				231	n = regexp.groupindex['delim'] - 1
				232	key = m[n]
				233	except KeyError:
				234	continue
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	235	if key and (delimiters is None or key in delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	236	delims[key] = delims.get(key, 0) + 1
				237	try:
				238	n = regexp.groupindex['space'] - 1
				239	except KeyError:
				240	continue
				241	if m[n]:
				242	spaces += 1
				243
				244	quotechar = reduce(lambda a, b, quotes = quotes:
				245	(quotes[a] > quotes[b]) and a or b, quotes.keys())
				246
				247	if delims:
				248	delim = reduce(lambda a, b, delims = delims:
				249	(delims[a] > delims[b]) and a or b, delims.keys())
				250	skipinitialspace = delims[delim] == spaces
				251	if delim == '\n': # most likely a file with a single column
				252	delim = ''
				253	else:
				254	# there is no delimiter, it's a single column of quoted data
				255	delim = ''
				256	skipinitialspace = 0
				257
				258	return (quotechar, delim, skipinitialspace)
				259
				260
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	261	def _guess_delimiter(self, data, delimiters):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	262	"""
				263	The delimiter /should/ occur the same number of times on
				264	each row. However, due to malformed data, it may not. We don't want
				265	an all or nothing approach, so we allow for small variations in this
				266	number.
				267	1) build a table of the frequency of each character on every line.
				268	2) build a table of freqencies of this frequency (meta-frequency?),
				269	e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
				270	7 times in 2 rows'
				271	3) use the mode of the meta-frequency to determine the /expected/
				272	frequency for that character
				273	4) find out how often the character actually meets that goal
				274	5) the character that best meets its goal is the delimiter
				275	For performance reasons, the data is evaluated in chunks, so it can
				276	try and evaluate the smallest portion of the data possible, evaluating
				277	additional chunks as necessary.
				278	"""
				279
				280	data = filter(None, data.split('\n'))
				281
				282	ascii = [chr(c) for c in range(127)] # 7-bit ASCII
				283
				284	# build frequency tables
				285	chunkLength = min(10, len(data))
				286	iteration = 0
				287	charFrequency = {}
				288	modes = {}
				289	delims = {}
				290	start, end = 0, min(chunkLength, len(data))
				291	while start < len(data):
				292	iteration += 1
				293	for line in data[start:end]:
				294	for char in ascii:
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	295	metaFrequency = charFrequency.get(char, {})
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	296	# must count even if frequency is 0
				297	freq = line.strip().count(char)
				298	# value is the mode
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	299	metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
				300	charFrequency[char] = metaFrequency
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	301
				302	for char in charFrequency.keys():
				303	items = charFrequency[char].items()
				304	if len(items) == 1 and items[0][0] == 0:
				305	continue
				306	# get the mode of the frequencies
				307	if len(items) > 1:
				308	modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
				309	items)
				310	# adjust the mode - subtract the sum of all
				311	# other frequencies
				312	items.remove(modes[char])
				313	modes[char] = (modes[char][0], modes[char][1]
				314	- reduce(lambda a, b: (0, a[1] + b[1]),
				315	items)[1])
				316	else:
				317	modes[char] = items[0]
				318
				319	# build a list of possible delimiters
				320	modeList = modes.items()
				321	total = float(chunkLength * iteration)
				322	# (rows of consistent data) / (number of rows) = 100%
				323	consistency = 1.0
				324	# minimum consistency threshold
				325	threshold = 0.9
				326	while len(delims) == 0 and consistency >= threshold:
				327	for k, v in modeList:
				328	if v[0] > 0 and v[1] > 0:
Skip Montanaro	7789237	2003-05-19 15:33:36 +0000	[diff] [blame]	329	if ((v[1]/total) >= consistency and
				330	(delimiters is None or k in delimiters)):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	331	delims[k] = v
				332	consistency -= 0.01
				333
				334	if len(delims) == 1:
				335	delim = delims.keys()[0]
				336	skipinitialspace = (data[0].count(delim) ==
				337	data[0].count("%c " % delim))
				338	return (delim, skipinitialspace)
				339
				340	# analyze another chunkLength lines
				341	start = end
				342	end += chunkLength
				343
				344	if not delims:
				345	return ('', 0)
				346
				347	# if there's more than one, fall back to a 'preferred' list
				348	if len(delims) > 1:
				349	for d in self.preferred:
				350	if d in delims.keys():
				351	skipinitialspace = (data[0].count(d) ==
				352	data[0].count("%c " % d))
				353	return (d, skipinitialspace)
				354
				355	# finally, just return the first damn character in the list
				356	delim = delims.keys()[0]
				357	skipinitialspace = (data[0].count(delim) ==
				358	data[0].count("%c " % delim))
				359	return (delim, skipinitialspace)
				360
				361
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	362	def has_header(self, sample):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	363	# Creates a dictionary of types of data in each column. If any
				364	# column is of a single type (say, integers), except for the first
				365	# row, then the first row is presumed to be labels. If the type
				366	# can't be determined, it is assumed to be a string in which case
				367	# the length of the string is the determining factor: if all of the
				368	# rows except for the first are the same length, it's a header.
				369	# Finally, a 'vote' is taken at the end for each column, adding or
				370	# subtracting from the likelihood of the first row being a header.
				371
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	372	rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	373
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	374	header = rdr.next() # assume first row is header
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	375
				376	columns = len(header)
				377	columnTypes = {}
				378	for i in range(columns): columnTypes[i] = None
				379
				380	checked = 0
Skip Montanaro	1448d47	2003-04-25 14:47:16 +0000	[diff] [blame]	381	for row in rdr:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	382	# arbitrary number of rows to check, to keep it sane
				383	if checked > 20:
				384	break
				385	checked += 1
				386
				387	if len(row) != columns:
				388	continue # skip rows that have irregular number of columns
				389
				390	for col in columnTypes.keys():
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	391
				392	for thisType in [int, long, float, complex]:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	393	try:
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	394	thisType(row[col])
				395	break
Raymond Hettinger	abe14e6	2003-06-12 03:59:17 +0000	[diff] [blame]	396	except (ValueError, OverflowError):
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	397	pass
				398	else:
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	399	# fallback to length of string
				400	thisType = len(row[col])
				401
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	402	# treat longs as ints
				403	if thisType == long:
				404	thisType = int
				405
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	406	if thisType != columnTypes[col]:
				407	if columnTypes[col] is None: # add new column type
				408	columnTypes[col] = thisType
				409	else:
				410	# type is inconsistent, remove column from
				411	# consideration
				412	del columnTypes[col]
				413
				414	# finally, compare results against first row and "vote"
				415	# on whether it's a header
				416	hasHeader = 0
				417	for col, colType in columnTypes.items():
				418	if type(colType) == type(0): # it's a length
				419	if len(header[col]) != colType:
				420	hasHeader += 1
				421	else:
				422	hasHeader -= 1
				423	else: # attempt typecast
				424	try:
Raymond Hettinger	39a5592	2003-06-12 03:01:55 +0000	[diff] [blame]	425	colType(header[col])
Raymond Hettinger	f31cb0c	2003-06-12 04:05:00 +0000	[diff] [blame]	426	except (ValueError, TypeError):
Skip Montanaro	04ae705	2003-04-24 20:21:31 +0000	[diff] [blame]	427	hasHeader += 1
				428	else:
				429	hasHeader -= 1
				430
				431	return hasHeader > 0