| """ |
| dialect = Sniffer().sniff(file('csv/easy.csv')) |
| print "delimiter", dialect.delimiter |
| print "quotechar", dialect.quotechar |
| print "skipinitialspace", dialect.skipinitialspace |
| """ |
| |
| from csv import csv |
| import re |
| |
| # ------------------------------------------------------------------------------ |
class Sniffer:
    """
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar).

    sniff() returns a csv.Dialect subclass describing the guessed format;
    hasHeaders() and register_dialect() operate on the result of the most
    recent sniff() call, so sniff() must be called first.
    """

    def __init__(self, sample=16 * 1024):
        """
        sample -- amount of data to read from the file object when sniffing
                  (passed straight to fileobj.read()).
        """
        # tie-break order used when more than one delimiter looks plausible
        self.preferred = [',', '\t', ';', ' ', ':']

        # amount of data to sample
        self.sample = sample

    def sniff(self, fileobj):
        """
        Takes a file-like object and returns a dialect class.

        The returned class derives from csv.Dialect and carries the guessed
        delimiter, quotechar and skipinitialspace. The delimiter is '' when
        none could be determined. The file object and the dialect are also
        remembered on the instance for hasHeaders() / register_dialect().
        """
        self.fileobj = fileobj

        data = fileobj.read(self.sample)

        quotechar, delimiter, skipinitialspace = \
            self._guessQuoteAndDelimiter(data)
        if delimiter is None:
            # no quoted fields to learn from: fall back to frequency analysis
            delimiter, skipinitialspace = self._guessDelimiter(data)

        class Dialect(csv.Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = csv.QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False
        Dialect.delimiter = delimiter
        # The csv module rejects an empty quotechar when quoting is enabled,
        # so fall back to the conventional '"' when no quotes were seen.
        Dialect.quotechar = quotechar or '"'
        Dialect.skipinitialspace = skipinitialspace

        self.dialect = Dialect
        return self.dialect

    def hasHeaders(self):
        """
        Returns True if the first row of the last sniffed file looks like a
        header row. Requires a prior call to sniff().
        """
        return self._hasHeaders(self.fileobj, self.dialect)

    def register_dialect(self, name='sniffed'):
        """
        Registers the last sniffed dialect with the csv module under `name`.
        Requires a prior call to sniff().
        """
        csv.register_dialect(name, self.dialect)

    def _guessQuoteAndDelimiter(self, data):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, delimiter, skipinitialspace). When nothing
        matches, returns ('', None, 0); delimiter is '' when quoted text was
        found but no delimiter surrounds it (a single column of quoted data).
        """
        # Tried in order; the first pattern that matches anything is used.
        patterns = (
            # ,".*?",
            r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',
            #  ".*?",
            r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',
            # ,".*?"
            r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',
            #  ".*?" (no delim, no space)
            r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',
        )

        matches = []
        for pattern in patterns:
            matches = list(re.compile(pattern, re.S | re.M).finditer(data))
            if matches:
                break

        if not matches:
            return ('', None, 0)  # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for match in matches:
            groups = match.groupdict()
            quote = groups['quote']
            if quote:
                quotes[quote] = quotes.get(quote, 0) + 1
            # the last pattern has no 'delim'/'space' groups at all
            delim = groups.get('delim')
            if delim:
                delims[delim] = delims.get(delim, 0) + 1
            if groups.get('space'):
                spaces += 1

        # most frequently seen quote character wins
        quotechar = max(quotes, key=quotes.get)

        if delims:
            delim = max(delims, key=delims.get)
            # true only if every occurrence of the winner was followed by
            # a space (spaces counts all matches that captured a space)
            skipinitialspace = delims[delim] == spaces
            if delim == '\n':  # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)

    def _usesInitialSpace(self, line, delim):
        # True when every `delim` on `line` is immediately followed by a space.
        return line.count(delim) == line.count(delim + ' ')

    def _guessDelimiter(self, data):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  "x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows"
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace); ('', 0) when no candidate
        is found.
        """
        lines = [line for line in data.split('\n') if line]

        candidates = [chr(c) for c in range(127)]  # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(lines))
        iteration = 0
        charFrequency = {}  # char -> {count per line: number of lines}
        modes = {}          # char -> (expected count, adjusted line tally)
        delims = {}
        start, end = 0, chunkLength
        while start < len(lines):
            iteration += 1
            for line in lines[start:end]:
                stripped = line.strip()
                for char in candidates:
                    metaFrequency = charFrequency.setdefault(char, {})
                    # must count even if frequency is 0
                    freq = stripped.count(char)
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1

            for char, metaFrequency in charFrequency.items():
                items = list(metaFrequency.items())
                if len(items) == 1 and items[0][0] == 0:
                    continue  # character never occurs
                if len(items) > 1:
                    # get the mode of the frequencies
                    mode = max(items, key=lambda item: item[1])
                    # adjust the mode - subtract the sum of all other
                    # frequencies
                    others = sum(count for _, count in items) - mode[1]
                    modes[char] = (mode[0], mode[1] - others)
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            # The last chunk may be partial, so the number of rows examined
            # so far can never exceed the number of rows available.
            total = float(min(chunkLength * iteration, len(lines)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            threshold = 0.9  # minimum consistency threshold
            while not delims and consistency >= threshold:
                for char, mode in modes.items():
                    if mode[0] > 0 and mode[1] > 0:
                        if (mode[1] / total) >= consistency:
                            delims[char] = mode
                consistency -= 0.01

            if len(delims) == 1:
                delim = next(iter(delims))
                return (delim, self._usesInitialSpace(lines[0], delim))

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for delim in self.preferred:
                if delim in delims:
                    return (delim, self._usesInitialSpace(lines[0], delim))

        # finally, just return the first character in the list
        delim = next(iter(delims))
        return (delim, self._usesInitialSpace(lines[0], delim))

    def _hasHeaders(self, fileobj, dialect):
        """
        Guesses whether the first row of `fileobj` is a header row.

        Creates a dictionary of types of data in each column. If any column
        is of a single type (say, integers), *except* for the first row, then
        the first row is presumed to be labels. If the type can't be
        determined, it is assumed to be a string in which case the length of
        the string is the determining factor: if all of the rows except for
        the first are the same length, it's a header.
        Finally, a 'vote' is taken at the end for each column, adding or
        subtracting from the likelihood of the first row being a header.

        Returns True if the vote favours a header, False otherwise (including
        when the file contains no rows at all).
        """

        def seval(item):
            """
            Strips parens from item prior to calling eval in an attempt to
            make it safer
            """
            # SECURITY: this still evaluates file content as a Python
            # expression; stripping parens only prevents calls. Do not run
            # this on untrusted input.
            return eval(item.replace('(', '').replace(')', ''))

        # rewind the fileobj - this might not work for some file-like
        # objects...
        fileobj.seek(0)

        reader = csv.reader(fileobj,
                            delimiter=dialect.delimiter,
                            quotechar=dialect.quotechar,
                            skipinitialspace=dialect.skipinitialspace)

        try:
            header = next(reader)  # assume first row is header
        except StopIteration:
            return False  # empty input: nothing that could be a header

        columns = len(header)
        # column index -> type seen (or string length); None until first row
        columnTypes = dict.fromkeys(range(columns))

        checked = 0
        for row in reader:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue  # skip rows that have irregular number of columns

            # iterate over a snapshot: columns are deleted inside the loop
            for col in list(columnTypes):
                try:
                    # is it a built-in type (besides string)?
                    thisType = type(seval(row[col]))
                    if thisType.__name__ == 'long':
                        thisType = int  # treat Python 2 long ints as int
                except Exception:
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None:  # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote" on whether
        # it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if isinstance(colType, int):  # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else:  # attempt typecast
                try:
                    # SECURITY: evaluates header text; see note in seval().
                    eval("%s(%s)" % (colType.__name__, header[col]))
                except Exception:
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
| |
| |
| |