Blame - python/lib/Lib/csv.py - platform/tools/idea

blob: c8e84813965c17318adc283b8cfa8876394562b8 [file] [log] [blame]

Tor Norbye	3a2425a	2013-11-04 10:16:08 -0800	[diff] [blame^]	1
				2	"""
				3	csv.py - read/write/investigate CSV files
				4	"""
				5
				6	import re
				7	from _csv import Error, __version__, writer, reader, register_dialect, \
				8	unregister_dialect, get_dialect, list_dialects, \
				9	field_size_limit, \
				10	QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
				11	__doc__
				12	from _csv import Dialect as _Dialect
				13
				14	try:
				15	from cStringIO import StringIO
				16	except ImportError:
				17	from StringIO import StringIO
				18
				19	__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
				20	"Error", "Dialect", "excel", "excel_tab", "reader", "writer",
				21	"register_dialect", "get_dialect", "list_dialects", "Sniffer",
				22	"unregister_dialect", "__version__", "DictReader", "DictWriter" ]
				23
				24	class Dialect:
				25	"""Describe an Excel dialect.
				26
				27	This must be subclassed (see csv.excel). Valid attributes are:
				28	delimiter, quotechar, escapechar, doublequote, skipinitialspace,
				29	lineterminator, quoting.
				30
				31	"""
				32	_name = ""
				33	_valid = False
				34	# placeholders
				35	delimiter = None
				36	quotechar = None
				37	escapechar = None
				38	doublequote = None
				39	skipinitialspace = None
				40	lineterminator = None
				41	quoting = None
				42
				43	def __init__(self):
				44	if self.__class__ != Dialect:
				45	self._valid = True
				46	self._validate()
				47
				48	def _validate(self):
				49	try:
				50	_Dialect(self)
				51	except TypeError, e:
				52	# We do this for compatibility with py2.3
				53	raise Error(str(e))
				54
				55	class excel(Dialect):
				56	"""Describe the usual properties of Excel-generated CSV files."""
				57	delimiter = ','
				58	quotechar = '"'
				59	doublequote = True
				60	skipinitialspace = False
				61	lineterminator = '\r\n'
				62	quoting = QUOTE_MINIMAL
				63	register_dialect("excel", excel)
				64
				65	class excel_tab(excel):
				66	"""Describe the usual properties of Excel-generated TAB-delimited files."""
				67	delimiter = '\t'
				68	register_dialect("excel-tab", excel_tab)
				69
				70
				71	class DictReader:
				72	def __init__(self, f, fieldnames=None, restkey=None, restval=None,
				73	dialect="excel", args, *kwds):
				74	self.fieldnames = fieldnames # list of keys for the dict
				75	self.restkey = restkey # key to catch long rows
				76	self.restval = restval # default value for short rows
				77	self.reader = reader(f, dialect, args, *kwds)
				78	self.dialect = dialect
				79	self.line_num = 0
				80
				81	def __iter__(self):
				82	return self
				83
				84	def next(self):
				85	row = self.reader.next()
				86	if self.fieldnames is None:
				87	self.fieldnames = row
				88	row = self.reader.next()
				89	self.line_num = self.reader.line_num
				90
				91	# unlike the basic reader, we prefer not to return blanks,
				92	# because we will typically wind up with a dict full of None
				93	# values
				94	while row == []:
				95	row = self.reader.next()
				96	d = dict(zip(self.fieldnames, row))
				97	lf = len(self.fieldnames)
				98	lr = len(row)
				99	if lf < lr:
				100	d[self.restkey] = row[lf:]
				101	elif lf > lr:
				102	for key in self.fieldnames[lr:]:
				103	d[key] = self.restval
				104	return d
				105
				106
				107	class DictWriter:
				108	def __init__(self, f, fieldnames, restval="", extrasaction="raise",
				109	dialect="excel", args, *kwds):
				110	self.fieldnames = fieldnames # list of keys for the dict
				111	self.restval = restval # for writing short dicts
				112	if extrasaction.lower() not in ("raise", "ignore"):
				113	raise ValueError, \
				114	("extrasaction (%s) must be 'raise' or 'ignore'" %
				115	extrasaction)
				116	self.extrasaction = extrasaction
				117	self.writer = writer(f, dialect, args, *kwds)
				118
				119	def _dict_to_list(self, rowdict):
				120	if self.extrasaction == "raise":
				121	for k in rowdict.keys():
				122	if k not in self.fieldnames:
				123	raise ValueError, "dict contains fields not in fieldnames"
				124	return [rowdict.get(key, self.restval) for key in self.fieldnames]
				125
				126	def writerow(self, rowdict):
				127	return self.writer.writerow(self._dict_to_list(rowdict))
				128
				129	def writerows(self, rowdicts):
				130	rows = []
				131	for rowdict in rowdicts:
				132	rows.append(self._dict_to_list(rowdict))
				133	return self.writer.writerows(rows)
				134
				135	# Guard Sniffer's type checking against builds that exclude complex()
				136	try:
				137	complex
				138	except NameError:
				139	complex = float
				140
				141	class Sniffer:
				142	'''
				143	"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
				144	Returns a Dialect object.
				145	'''
				146	def __init__(self):
				147	# in case there is more than one possible delimiter
				148	self.preferred = [',', '\t', ';', ' ', ':']
				149
				150
				151	def sniff(self, sample, delimiters=None):
				152	"""
				153	Returns a dialect (or None) corresponding to the sample
				154	"""
				155
				156	quotechar, delimiter, skipinitialspace = \
				157	self._guess_quote_and_delimiter(sample, delimiters)
				158	if not delimiter:
				159	delimiter, skipinitialspace = self._guess_delimiter(sample,
				160	delimiters)
				161
				162	if not delimiter:
				163	raise Error, "Could not determine delimiter"
				164
				165	class dialect(Dialect):
				166	_name = "sniffed"
				167	lineterminator = '\r\n'
				168	quoting = QUOTE_MINIMAL
				169	# escapechar = ''
				170	doublequote = False
				171
				172	dialect.delimiter = delimiter
				173	# _csv.reader won't accept a quotechar of ''
				174	dialect.quotechar = quotechar or '"'
				175	dialect.skipinitialspace = skipinitialspace
				176
				177	return dialect
				178
				179
				180	def _guess_quote_and_delimiter(self, data, delimiters):
				181	"""
				182	Looks for text enclosed between two identical quotes
				183	(the probable quotechar) which are preceded and followed
				184	by the same character (the probable delimiter).
				185	For example:
				186	,'some text',
				187	The quote with the most wins, same with the delimiter.
				188	If there is no quotechar the delimiter can't be determined
				189	this way.
				190	"""
				191
				192	matches = []
				193	for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?P=delim)', # ,".?",
				194	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".?",
				195	'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).?(?P=quote)(?:$\|\n)', # ,".?"
				196	'(?:^\|\n)(?P<quote>["\']).?(?P=quote)(?:$\|\n)'): # ".?" (no delim, no space)
				197	regexp = re.compile(restr, re.DOTALL \| re.MULTILINE)
				198	matches = regexp.findall(data)
				199	if matches:
				200	break
				201
				202	if not matches:
				203	return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
				204
				205	quotes = {}
				206	delims = {}
				207	spaces = 0
				208	for m in matches:
				209	n = regexp.groupindex['quote'] - 1
				210	key = m[n]
				211	if key:
				212	quotes[key] = quotes.get(key, 0) + 1
				213	try:
				214	n = regexp.groupindex['delim'] - 1
				215	key = m[n]
				216	except KeyError:
				217	continue
				218	if key and (delimiters is None or key in delimiters):
				219	delims[key] = delims.get(key, 0) + 1
				220	try:
				221	n = regexp.groupindex['space'] - 1
				222	except KeyError:
				223	continue
				224	if m[n]:
				225	spaces += 1
				226
				227	quotechar = reduce(lambda a, b, quotes = quotes:
				228	(quotes[a] > quotes[b]) and a or b, quotes.keys())
				229
				230	if delims:
				231	delim = reduce(lambda a, b, delims = delims:
				232	(delims[a] > delims[b]) and a or b, delims.keys())
				233	skipinitialspace = delims[delim] == spaces
				234	if delim == '\n': # most likely a file with a single column
				235	delim = ''
				236	else:
				237	# there is no delimiter, it's a single column of quoted data
				238	delim = ''
				239	skipinitialspace = 0
				240
				241	return (quotechar, delim, skipinitialspace)
				242
				243
				244	def _guess_delimiter(self, data, delimiters):
				245	"""
				246	The delimiter /should/ occur the same number of times on
				247	each row. However, due to malformed data, it may not. We don't want
				248	an all or nothing approach, so we allow for small variations in this
				249	number.
				250	1) build a table of the frequency of each character on every line.
				251	2) build a table of freqencies of this frequency (meta-frequency?),
				252	e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
				253	7 times in 2 rows'
				254	3) use the mode of the meta-frequency to determine the /expected/
				255	frequency for that character
				256	4) find out how often the character actually meets that goal
				257	5) the character that best meets its goal is the delimiter
				258	For performance reasons, the data is evaluated in chunks, so it can
				259	try and evaluate the smallest portion of the data possible, evaluating
				260	additional chunks as necessary.
				261	"""
				262
				263	data = filter(None, data.split('\n'))
				264
				265	ascii = [chr(c) for c in range(127)] # 7-bit ASCII
				266
				267	# build frequency tables
				268	chunkLength = min(10, len(data))
				269	iteration = 0
				270	charFrequency = {}
				271	modes = {}
				272	delims = {}
				273	start, end = 0, min(chunkLength, len(data))
				274	while start < len(data):
				275	iteration += 1
				276	for line in data[start:end]:
				277	for char in ascii:
				278	metaFrequency = charFrequency.get(char, {})
				279	# must count even if frequency is 0
				280	freq = line.count(char)
				281	# value is the mode
				282	metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
				283	charFrequency[char] = metaFrequency
				284
				285	for char in charFrequency.keys():
				286	items = charFrequency[char].items()
				287	if len(items) == 1 and items[0][0] == 0:
				288	continue
				289	# get the mode of the frequencies
				290	if len(items) > 1:
				291	modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
				292	items)
				293	# adjust the mode - subtract the sum of all
				294	# other frequencies
				295	items.remove(modes[char])
				296	modes[char] = (modes[char][0], modes[char][1]
				297	- reduce(lambda a, b: (0, a[1] + b[1]),
				298	items)[1])
				299	else:
				300	modes[char] = items[0]
				301
				302	# build a list of possible delimiters
				303	modeList = modes.items()
				304	total = float(chunkLength * iteration)
				305	# (rows of consistent data) / (number of rows) = 100%
				306	consistency = 1.0
				307	# minimum consistency threshold
				308	threshold = 0.9
				309	while len(delims) == 0 and consistency >= threshold:
				310	for k, v in modeList:
				311	if v[0] > 0 and v[1] > 0:
				312	if ((v[1]/total) >= consistency and
				313	(delimiters is None or k in delimiters)):
				314	delims[k] = v
				315	consistency -= 0.01
				316
				317	if len(delims) == 1:
				318	delim = delims.keys()[0]
				319	skipinitialspace = (data[0].count(delim) ==
				320	data[0].count("%c " % delim))
				321	return (delim, skipinitialspace)
				322
				323	# analyze another chunkLength lines
				324	start = end
				325	end += chunkLength
				326
				327	if not delims:
				328	return ('', 0)
				329
				330	# if there's more than one, fall back to a 'preferred' list
				331	if len(delims) > 1:
				332	for d in self.preferred:
				333	if d in delims.keys():
				334	skipinitialspace = (data[0].count(d) ==
				335	data[0].count("%c " % d))
				336	return (d, skipinitialspace)
				337
				338	# nothing else indicates a preference, pick the character that
				339	# dominates(?)
				340	items = [(v,k) for (k,v) in delims.items()]
				341	items.sort()
				342	delim = items[-1][1]
				343
				344	skipinitialspace = (data[0].count(delim) ==
				345	data[0].count("%c " % delim))
				346	return (delim, skipinitialspace)
				347
				348
				349	def has_header(self, sample):
				350	# Creates a dictionary of types of data in each column. If any
				351	# column is of a single type (say, integers), except for the first
				352	# row, then the first row is presumed to be labels. If the type
				353	# can't be determined, it is assumed to be a string in which case
				354	# the length of the string is the determining factor: if all of the
				355	# rows except for the first are the same length, it's a header.
				356	# Finally, a 'vote' is taken at the end for each column, adding or
				357	# subtracting from the likelihood of the first row being a header.
				358
				359	rdr = reader(StringIO(sample), self.sniff(sample))
				360
				361	header = rdr.next() # assume first row is header
				362
				363	columns = len(header)
				364	columnTypes = {}
				365	for i in range(columns): columnTypes[i] = None
				366
				367	checked = 0
				368	for row in rdr:
				369	# arbitrary number of rows to check, to keep it sane
				370	if checked > 20:
				371	break
				372	checked += 1
				373
				374	if len(row) != columns:
				375	continue # skip rows that have irregular number of columns
				376
				377	for col in columnTypes.keys():
				378
				379	for thisType in [int, long, float, complex]:
				380	try:
				381	thisType(row[col])
				382	break
				383	except (ValueError, OverflowError):
				384	pass
				385	else:
				386	# fallback to length of string
				387	thisType = len(row[col])
				388
				389	# treat longs as ints
				390	if thisType == long:
				391	thisType = int
				392
				393	if thisType != columnTypes[col]:
				394	if columnTypes[col] is None: # add new column type
				395	columnTypes[col] = thisType
				396	else:
				397	# type is inconsistent, remove column from
				398	# consideration
				399	del columnTypes[col]
				400
				401	# finally, compare results against first row and "vote"
				402	# on whether it's a header
				403	hasHeader = 0
				404	for col, colType in columnTypes.items():
				405	if type(colType) == type(0): # it's a length
				406	if len(header[col]) != colType:
				407	hasHeader += 1
				408	else:
				409	hasHeader -= 1
				410	else: # attempt typecast
				411	try:
				412	colType(header[col])
				413	except (ValueError, TypeError):
				414	hasHeader += 1
				415	else:
				416	hasHeader -= 1
				417
				418	return hasHeader > 0