"""
dialect = Sniffer().sniff(file('csv/easy.csv'))
print "delimiter", dialect.delimiter
print "quotechar", dialect.quotechar
print "skipinitialspace", dialect.skipinitialspace
"""
| 7 | |
| 8 | from csv import csv |
| 9 | import re |
| 10 | |
| 11 | # ------------------------------------------------------------------------------ |
| 12 | class Sniffer: |
| 13 | """ |
| 14 | "Sniffs" the format of a CSV file (i.e. delimiter, quotechar) |
| 15 | Returns a csv.Dialect object. |
| 16 | """ |
| 17 | def __init__(self, sample = 16 * 1024): |
| 18 | # in case there is more than one possible delimiter |
| 19 | self.preferred = [',', '\t', ';', ' ', ':'] |
| 20 | |
| 21 | # amount of data (in bytes) to sample |
| 22 | self.sample = sample |
| 23 | |
| 24 | |
| 25 | def sniff(self, fileobj): |
| 26 | """ |
| 27 | Takes a file-like object and returns a dialect (or None) |
| 28 | """ |
| 29 | |
| 30 | self.fileobj = fileobj |
| 31 | |
| 32 | data = fileobj.read(self.sample) |
| 33 | |
| 34 | quotechar, delimiter, skipinitialspace = self._guessQuoteAndDelimiter(data) |
| 35 | if delimiter is None: |
| 36 | delimiter, skipinitialspace = self._guessDelimiter(data) |
| 37 | |
| 38 | class Dialect(csv.Dialect): |
| 39 | _name = "sniffed" |
| 40 | lineterminator = '\r\n' |
| 41 | quoting = csv.QUOTE_MINIMAL |
| 42 | # escapechar = '' |
| 43 | doublequote = False |
| 44 | Dialect.delimiter = delimiter |
| 45 | Dialect.quotechar = quotechar |
| 46 | Dialect.skipinitialspace = skipinitialspace |
| 47 | |
| 48 | self.dialect = Dialect |
| 49 | return self.dialect |
| 50 | |
| 51 | |
| 52 | def hasHeaders(self): |
| 53 | return self._hasHeaders(self.fileobj, self.dialect) |
| 54 | |
| 55 | |
| 56 | def register_dialect(self, name = 'sniffed'): |
| 57 | csv.register_dialect(name, self.dialect) |
| 58 | |
| 59 | |
| 60 | def _guessQuoteAndDelimiter(self, data): |
| 61 | """ |
| 62 | Looks for text enclosed between two identical quotes |
| 63 | (the probable quotechar) which are preceded and followed |
| 64 | by the same character (the probable delimiter). |
| 65 | For example: |
| 66 | ,'some text', |
| 67 | The quote with the most wins, same with the delimiter. |
| 68 | If there is no quotechar the delimiter can't be determined |
| 69 | this way. |
| 70 | """ |
| 71 | |
| 72 | matches = [] |
| 73 | for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?", |
| 74 | '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?", |
| 75 | '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?" |
| 76 | '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space) |
| 77 | regexp = re.compile(restr, re.S | re.M) |
| 78 | matches = regexp.findall(data) |
| 79 | if matches: |
| 80 | break |
| 81 | |
| 82 | if not matches: |
| 83 | return ('', None, 0) # (quotechar, delimiter, skipinitialspace) |
| 84 | |
| 85 | quotes = {} |
| 86 | delims = {} |
| 87 | spaces = 0 |
| 88 | for m in matches: |
| 89 | n = regexp.groupindex['quote'] - 1 |
| 90 | key = m[n] |
| 91 | if key: |
| 92 | quotes[key] = quotes.get(key, 0) + 1 |
| 93 | try: |
| 94 | n = regexp.groupindex['delim'] - 1 |
| 95 | key = m[n] |
| 96 | except KeyError: |
| 97 | continue |
| 98 | if key: |
| 99 | delims[key] = delims.get(key, 0) + 1 |
| 100 | try: |
| 101 | n = regexp.groupindex['space'] - 1 |
| 102 | except KeyError: |
| 103 | continue |
| 104 | if m[n]: |
| 105 | spaces += 1 |
| 106 | |
| 107 | quotechar = reduce(lambda a, b, quotes = quotes: |
| 108 | (quotes[a] > quotes[b]) and a or b, quotes.keys()) |
| 109 | |
| 110 | if delims: |
| 111 | delim = reduce(lambda a, b, delims = delims: |
| 112 | (delims[a] > delims[b]) and a or b, delims.keys()) |
| 113 | skipinitialspace = delims[delim] == spaces |
| 114 | if delim == '\n': # most likely a file with a single column |
| 115 | delim = '' |
| 116 | else: |
| 117 | # there is *no* delimiter, it's a single column of quoted data |
| 118 | delim = '' |
| 119 | skipinitialspace = 0 |
| 120 | |
| 121 | return (quotechar, delim, skipinitialspace) |
| 122 | |
| 123 | |
| 124 | def _guessDelimiter(self, data): |
| 125 | """ |
| 126 | The delimiter /should/ occur the same number of times on |
| 127 | each row. However, due to malformed data, it may not. We don't want |
| 128 | an all or nothing approach, so we allow for small variations in this |
| 129 | number. |
| 130 | 1) build a table of the frequency of each character on every line. |
| 131 | 2) build a table of freqencies of this frequency (meta-frequency?), |
| 132 | e.g. "x occurred 5 times in 10 rows, 6 times in 1000 rows, |
| 133 | 7 times in 2 rows" |
| 134 | 3) use the mode of the meta-frequency to determine the /expected/ |
| 135 | frequency for that character |
| 136 | 4) find out how often the character actually meets that goal |
| 137 | 5) the character that best meets its goal is the delimiter |
| 138 | For performance reasons, the data is evaluated in chunks, so it can |
| 139 | try and evaluate the smallest portion of the data possible, evaluating |
| 140 | additional chunks as necessary. |
| 141 | """ |
| 142 | |
| 143 | data = filter(None, data.split('\n')) |
| 144 | |
| 145 | ascii = [chr(c) for c in range(127)] # 7-bit ASCII |
| 146 | |
| 147 | # build frequency tables |
| 148 | chunkLength = min(10, len(data)) |
| 149 | iteration = 0 |
| 150 | charFrequency = {} |
| 151 | modes = {} |
| 152 | delims = {} |
| 153 | start, end = 0, min(chunkLength, len(data)) |
| 154 | while start < len(data): |
| 155 | iteration += 1 |
| 156 | for line in data[start:end]: |
| 157 | for char in ascii: |
| 158 | metafrequency = charFrequency.get(char, {}) |
| 159 | freq = line.strip().count(char) # must count even if frequency is 0 |
| 160 | metafrequency[freq] = metafrequency.get(freq, 0) + 1 # value is the mode |
| 161 | charFrequency[char] = metafrequency |
| 162 | |
| 163 | for char in charFrequency.keys(): |
| 164 | items = charFrequency[char].items() |
| 165 | if len(items) == 1 and items[0][0] == 0: |
| 166 | continue |
| 167 | # get the mode of the frequencies |
| 168 | if len(items) > 1: |
| 169 | modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b, items) |
| 170 | # adjust the mode - subtract the sum of all other frequencies |
| 171 | items.remove(modes[char]) |
| 172 | modes[char] = (modes[char][0], modes[char][1] |
| 173 | - reduce(lambda a, b: (0, a[1] + b[1]), items)[1]) |
| 174 | else: |
| 175 | modes[char] = items[0] |
| 176 | |
| 177 | # build a list of possible delimiters |
| 178 | modeList = modes.items() |
| 179 | total = float(chunkLength * iteration) |
| 180 | consistency = 1.0 # (rows of consistent data) / (number of rows) = 100% |
| 181 | threshold = 0.9 # minimum consistency threshold |
| 182 | while len(delims) == 0 and consistency >= threshold: |
| 183 | for k, v in modeList: |
| 184 | if v[0] > 0 and v[1] > 0: |
| 185 | if (v[1]/total) >= consistency: |
| 186 | delims[k] = v |
| 187 | consistency -= 0.01 |
| 188 | |
| 189 | if len(delims) == 1: |
| 190 | delim = delims.keys()[0] |
| 191 | skipinitialspace = data[0].count(delim) == data[0].count("%c " % delim) |
| 192 | return (delim, skipinitialspace) |
| 193 | |
| 194 | # analyze another chunkLength lines |
| 195 | start = end |
| 196 | end += chunkLength |
| 197 | |
| 198 | if not delims: |
| 199 | return ('', 0) |
| 200 | |
| 201 | # if there's more than one, fall back to a 'preferred' list |
| 202 | if len(delims) > 1: |
| 203 | for d in self.preferred: |
| 204 | if d in delims.keys(): |
| 205 | skipinitialspace = data[0].count(d) == data[0].count("%c " % d) |
| 206 | return (d, skipinitialspace) |
| 207 | |
| 208 | # finally, just return the first damn character in the list |
| 209 | delim = delims.keys()[0] |
| 210 | skipinitialspace = data[0].count(delim) == data[0].count("%c " % delim) |
| 211 | return (delim, skipinitialspace) |
| 212 | |
| 213 | |
| 214 | def _hasHeaders(self, fileobj, dialect): |
| 215 | # Creates a dictionary of types of data in each column. If any column |
| 216 | # is of a single type (say, integers), *except* for the first row, then the first |
| 217 | # row is presumed to be labels. If the type can't be determined, it is assumed to |
| 218 | # be a string in which case the length of the string is the determining factor: if |
| 219 | # all of the rows except for the first are the same length, it's a header. |
| 220 | # Finally, a 'vote' is taken at the end for each column, adding or subtracting from |
| 221 | # the likelihood of the first row being a header. |
| 222 | |
| 223 | def seval(item): |
| 224 | """ |
| 225 | Strips parens from item prior to calling eval in an attempt to make it safer |
| 226 | """ |
| 227 | return eval(item.replace('(', '').replace(')', '')) |
| 228 | |
| 229 | fileobj.seek(0) # rewind the fileobj - this might not work for some file-like objects... |
| 230 | |
| 231 | reader = csv.reader(fileobj, |
| 232 | delimiter = dialect.delimiter, |
| 233 | quotechar = dialect.quotechar, |
| 234 | skipinitialspace = dialect.skipinitialspace) |
| 235 | |
| 236 | header = reader.next() # assume first row is header |
| 237 | |
| 238 | columns = len(header) |
| 239 | columnTypes = {} |
| 240 | for i in range(columns): columnTypes[i] = None |
| 241 | |
| 242 | checked = 0 |
| 243 | for row in reader: |
| 244 | if checked > 20: # arbitrary number of rows to check, to keep it sane |
| 245 | break |
| 246 | checked += 1 |
| 247 | |
| 248 | if len(row) != columns: |
| 249 | continue # skip rows that have irregular number of columns |
| 250 | |
| 251 | for col in columnTypes.keys(): |
| 252 | try: |
| 253 | try: |
| 254 | # is it a built-in type (besides string)? |
| 255 | thisType = type(seval(row[col])) |
| 256 | except OverflowError: |
| 257 | # a long int? |
| 258 | thisType = type(seval(row[col] + 'L')) |
| 259 | thisType = type(0) # treat long ints as int |
| 260 | except: |
| 261 | # fallback to length of string |
| 262 | thisType = len(row[col]) |
| 263 | |
| 264 | if thisType != columnTypes[col]: |
| 265 | if columnTypes[col] is None: # add new column type |
| 266 | columnTypes[col] = thisType |
| 267 | else: # type is inconsistent, remove column from consideration |
| 268 | del columnTypes[col] |
| 269 | |
| 270 | # finally, compare results against first row and "vote" on whether it's a header |
| 271 | hasHeader = 0 |
| 272 | for col, colType in columnTypes.items(): |
| 273 | if type(colType) == type(0): # it's a length |
| 274 | if len(header[col]) != colType: |
| 275 | hasHeader += 1 |
| 276 | else: |
| 277 | hasHeader -= 1 |
| 278 | else: # attempt typecast |
| 279 | try: |
| 280 | eval("%s(%s)" % (colType.__name__, header[col])) |
| 281 | except: |
| 282 | hasHeader += 1 |
| 283 | else: |
| 284 | hasHeader -= 1 |
| 285 | |
| 286 | return hasHeader > 0 |
| 287 | |
| 288 | |
| 289 | |