"""
Guess the format (delimiter, quote character, ...) of a CSV file, e.g.:

    dialect = Sniffer().sniff(open('csv/easy.csv'))
    print "delimiter", dialect.delimiter
    print "quotechar", dialect.quotechar
    print "skipinitialspace", dialect.skipinitialspace
"""

import csv
import re

# ------------------------------------------------------------------------------
class Sniffer:
    """
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar).
    The sniff() method returns a csv.Dialect subclass describing the format.
    """
    def __init__(self, sample = 16 * 1024):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

        # amount of data (in bytes) to sample
        self.sample = sample


    def sniff(self, fileobj):
        """
        Takes a file-like object and returns a Dialect describing its format
        """

        self.fileobj = fileobj

        data = fileobj.read(self.sample)

        quotechar, delimiter, skipinitialspace = self._guessQuoteAndDelimiter(data)
        if delimiter is None:
            delimiter, skipinitialspace = self._guessDelimiter(data)

        class Dialect(csv.Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = csv.QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False
        Dialect.delimiter = delimiter
        Dialect.quotechar = quotechar
        Dialect.skipinitialspace = skipinitialspace

        self.dialect = Dialect
        return self.dialect


    def hasHeaders(self):
        return self._hasHeaders(self.fileobj, self.dialect)


    def register_dialect(self, name = 'sniffed'):
        csv.register_dialect(name, self.dialect)


    def _guessQuoteAndDelimiter(self, data):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
            ,'some text',
        The quote character with the most matches wins; likewise for
        the delimiter.  If there is no quotechar the delimiter can't
        be determined this way.
        """
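        # Worked example (illustrative; not from the original code): in the
        # sample text
        #     7, 'x', 9\n8, 'y', 10\n
        # the first pattern below matches , 'x', and , 'y', so "'" gets two
        # votes for quotechar and ',' two votes for delimiter, and because a
        # space follows the delimiter in both matches, skipinitialspace is
        # guessed to be true.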

        matches = []
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   # ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            # ".*?" (no delim, no space)
            regexp = re.compile(restr, re.S | re.M)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key:
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

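        # pick the quote character with the highest count; the
        # default-argument lambda plus reduce is an old-style spelling of
        # max(quotes, key=quotes.get)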
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guessDelimiter(self, data):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number:
        1) build a table of the frequency of each character on every line.
        2) build a table of frequencies of this frequency (meta-frequency?),
           e.g. "x occurred 5 times in 10 rows, 6 times in 1000 rows,
           7 times in 2 rows"
        3) use the mode of the meta-frequency to determine the /expected/
           frequency for that character
        4) find out how often the character actually meets that goal
        5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """
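        # Worked example (illustrative; not from the original code): for the
        # three rows "a,b,c", "d,e,f" and "g,h", the comma occurs 2, 2 and 1
        # times, so its meta-frequency table is {2: 2, 1: 1}.  The mode of
        # that table is (2, 2); after the adjustment below it becomes (2, 1),
        # i.e. ',' met its expected frequency of 2 on a net one row.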

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metafrequency = charFrequency.get(char, {})
                    freq = line.strip().count(char) # must count even if frequency is 0
                    metafrequency[freq] = metafrequency.get(freq, 0) + 1 # how many lines had this frequency
                    charFrequency[char] = metafrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b, items)
                    # adjust the mode - subtract the sum of all other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]), items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            consistency = 1.0 # (rows of consistent data) / (number of rows) = 100%
            threshold = 0.9   # minimum consistency threshold
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if (v[1] / total) >= consistency:
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = data[0].count(delim) == data[0].count("%c " % delim)
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one candidate, fall back to the 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = data[0].count(d) == data[0].count("%c " % d)
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = delims.keys()[0]
        skipinitialspace = data[0].count(delim) == data[0].count("%c " % delim)
        return (delim, skipinitialspace)


    def _hasHeaders(self, fileobj, dialect):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.
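        # Illustrative example (not from the original code): given the rows
        #     name,age / alice,38 / bob,25
        # the 'name' column drops out of consideration (string lengths 5 and
        # 3 differ), while the 'age' column is consistently int-typed and
        # int('age') fails to evaluate, so that column votes +1 for "has a
        # header".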

        def seval(item):
            """
            Strips parens from item prior to calling eval in an attempt to
            make it safer
            """
            return eval(item.replace('(', '').replace(')', ''))

        fileobj.seek(0) # rewind the fileobj - this might not work for some file-like objects...

        reader = csv.reader(fileobj,
                            delimiter = dialect.delimiter,
                            quotechar = dialect.quotechar,
                            skipinitialspace = dialect.skipinitialspace)

        header = reader.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in reader:
            if checked > 20: # arbitrary number of rows to check, to keep it sane
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have an irregular number of columns

            for col in columnTypes.keys():
                try:
                    try:
                        # is it a built-in type (besides string)?
                        thisType = type(seval(row[col]))
                    except OverflowError:
                        # a long int? make sure it parses, then
                        seval(row[col] + 'L')
                        thisType = type(0) # treat long ints as int
                except:
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else: # type is inconsistent, remove column from consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote" on whether
        # it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    eval("%s(%s)" % (colType.__name__, header[col]))
                except:
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
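

# A minimal, hypothetical smoke test -- not part of the original module.  It
# mirrors the usage shown in the module docstring and assumes a sample file
# 'csv/easy.csv' exists alongside this script.
if __name__ == '__main__':
    sniffer = Sniffer()
    dialect = sniffer.sniff(open('csv/easy.csv'))
    print "delimiter:", repr(dialect.delimiter)
    print "quotechar:", repr(dialect.quotechar)
    print "skipinitialspace:", dialect.skipinitialspace
    print "has headers:", sniffer.hasHeaders()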