blob: 89d86d6ce02ebbb13bbdfdfc7f61523924eda95a [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
10 __doc__
11
12__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
13 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
14 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
15 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
16
17class Dialect:
18 _name = ""
19 _valid = False
20 # placeholders
21 delimiter = None
22 quotechar = None
23 escapechar = None
24 doublequote = None
25 skipinitialspace = None
26 lineterminator = None
27 quoting = None
28
29 def __init__(self):
30 if self.__class__ != Dialect:
31 self._valid = True
32 errors = self._validate()
33 if errors != []:
34 raise Error, "Dialect did not validate: %s" % ", ".join(errors)
35
36 def _validate(self):
37 errors = []
38 if not self._valid:
39 errors.append("can't directly instantiate Dialect class")
40
41 if self.delimiter is None:
42 errors.append("delimiter character not set")
43 elif (not isinstance(self.delimiter, str) or
44 len(self.delimiter) > 1):
45 errors.append("delimiter must be one-character string")
46
47 if self.quotechar is None:
48 if self.quoting != QUOTE_NONE:
49 errors.append("quotechar not set")
50 elif (not isinstance(self.quotechar, str) or
51 len(self.quotechar) > 1):
52 errors.append("quotechar must be one-character string")
53
54 if self.lineterminator is None:
55 errors.append("lineterminator not set")
56 elif not isinstance(self.lineterminator, str):
57 errors.append("lineterminator must be a string")
58
59 if self.doublequote not in (True, False):
60 errors.append("doublequote parameter must be True or False")
61
62 if self.skipinitialspace not in (True, False):
63 errors.append("skipinitialspace parameter must be True or False")
64
65 if self.quoting is None:
66 errors.append("quoting parameter not set")
67
68 if self.quoting is QUOTE_NONE:
69 if (not isinstance(self.escapechar, (unicode, str)) or
70 len(self.escapechar) > 1):
71 errors.append("escapechar must be a one-character string or unicode object")
72
73 return errors
74
75class excel(Dialect):
76 delimiter = ','
77 quotechar = '"'
78 doublequote = True
79 skipinitialspace = False
80 lineterminator = '\r\n'
81 quoting = QUOTE_MINIMAL
82register_dialect("excel", excel)
83
84class excel_tab(excel):
85 delimiter = '\t'
86register_dialect("excel-tab", excel_tab)
87
88
89class DictReader:
90 def __init__(self, f, fieldnames, restkey=None, restval=None,
91 dialect="excel", *args):
92 self.fieldnames = fieldnames # list of keys for the dict
93 self.restkey = restkey # key to catch long rows
94 self.restval = restval # default value for short rows
95 self.reader = reader(f, dialect, *args)
96
97 def __iter__(self):
98 return self
99
100 def next(self):
101 row = self.reader.next()
102 # unlike the basic reader, we prefer not to return blanks,
103 # because we will typically wind up with a dict full of None
104 # values
105 while row == []:
106 row = self.reader.next()
107 d = dict(zip(self.fieldnames, row))
108 lf = len(self.fieldnames)
109 lr = len(row)
110 if lf < lr:
111 d[self.restkey] = row[lf:]
112 elif lf > lr:
113 for key in self.fieldnames[lr:]:
114 d[key] = self.restval
115 return d
116
117
118class DictWriter:
119 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
120 dialect="excel", *args):
121 self.fieldnames = fieldnames # list of keys for the dict
122 self.restval = restval # for writing short dicts
123 if extrasaction.lower() not in ("raise", "ignore"):
124 raise ValueError, \
125 ("extrasaction (%s) must be 'raise' or 'ignore'" %
126 extrasaction)
127 self.extrasaction = extrasaction
128 self.writer = writer(f, dialect, *args)
129
130 def _dict_to_list(self, rowdict):
131 if self.extrasaction == "raise":
132 for k in rowdict.keys():
133 if k not in self.fieldnames:
134 raise ValueError, "dict contains fields not in fieldnames"
135 return [rowdict.get(key, self.restval) for key in self.fieldnames]
136
137 def writerow(self, rowdict):
138 return self.writer.writerow(self._dict_to_list(rowdict))
139
140 def writerows(self, rowdicts):
141 rows = []
142 for rowdict in rowdicts:
143 rows.append(self._dict_to_list(rowdict))
144 return self.writer.writerows(rows)
145
146
147class Sniffer:
148 '''
149 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
150 Returns a csv.Dialect object.
151 '''
152 def __init__(self, sample = 16 * 1024):
153 # in case there is more than one possible delimiter
154 self.preferred = [',', '\t', ';', ' ', ':']
155
156 # amount of data (in bytes) to sample
157 self.sample = sample
158
159
160 def sniff(self, fileobj):
161 """
162 Takes a file-like object and returns a dialect (or None)
163 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000164 self.fileobj = fileobj
165
166 data = fileobj.read(self.sample)
167
168 quotechar, delimiter, skipinitialspace = \
169 self._guessQuoteAndDelimiter(data)
170 if delimiter is None:
171 delimiter, skipinitialspace = self._guessDelimiter(data)
172
Fred Drake7c852f32003-04-25 14:27:00 +0000173 class SniffedDialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000174 _name = "sniffed"
175 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000176 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000177 # escapechar = ''
178 doublequote = False
Fred Drake7c852f32003-04-25 14:27:00 +0000179 SniffedDialect.delimiter = delimiter
180 SniffedDialect.quotechar = quotechar
181 SniffedDialect.skipinitialspace = skipinitialspace
Skip Montanaro04ae7052003-04-24 20:21:31 +0000182
Fred Drake7c852f32003-04-25 14:27:00 +0000183 self.dialect = SniffedDialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000184 return self.dialect
185
186
187 def hasHeaders(self):
188 return self._hasHeaders(self.fileobj, self.dialect)
189
190
Fred Drake7c852f32003-04-25 14:27:00 +0000191 def register_dialect(self, name='sniffed'):
192 register_dialect(name, self.dialect)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000193
194
195 def _guessQuoteAndDelimiter(self, data):
196 """
197 Looks for text enclosed between two identical quotes
198 (the probable quotechar) which are preceded and followed
199 by the same character (the probable delimiter).
200 For example:
201 ,'some text',
202 The quote with the most wins, same with the delimiter.
203 If there is no quotechar the delimiter can't be determined
204 this way.
205 """
206
207 matches = []
208 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
209 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
210 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
211 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
212 regexp = re.compile(restr, re.S | re.M)
213 matches = regexp.findall(data)
214 if matches:
215 break
216
217 if not matches:
218 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
219
220 quotes = {}
221 delims = {}
222 spaces = 0
223 for m in matches:
224 n = regexp.groupindex['quote'] - 1
225 key = m[n]
226 if key:
227 quotes[key] = quotes.get(key, 0) + 1
228 try:
229 n = regexp.groupindex['delim'] - 1
230 key = m[n]
231 except KeyError:
232 continue
233 if key:
234 delims[key] = delims.get(key, 0) + 1
235 try:
236 n = regexp.groupindex['space'] - 1
237 except KeyError:
238 continue
239 if m[n]:
240 spaces += 1
241
242 quotechar = reduce(lambda a, b, quotes = quotes:
243 (quotes[a] > quotes[b]) and a or b, quotes.keys())
244
245 if delims:
246 delim = reduce(lambda a, b, delims = delims:
247 (delims[a] > delims[b]) and a or b, delims.keys())
248 skipinitialspace = delims[delim] == spaces
249 if delim == '\n': # most likely a file with a single column
250 delim = ''
251 else:
252 # there is *no* delimiter, it's a single column of quoted data
253 delim = ''
254 skipinitialspace = 0
255
256 return (quotechar, delim, skipinitialspace)
257
258
259 def _guessDelimiter(self, data):
260 """
261 The delimiter /should/ occur the same number of times on
262 each row. However, due to malformed data, it may not. We don't want
263 an all or nothing approach, so we allow for small variations in this
264 number.
265 1) build a table of the frequency of each character on every line.
266 2) build a table of freqencies of this frequency (meta-frequency?),
267 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
268 7 times in 2 rows'
269 3) use the mode of the meta-frequency to determine the /expected/
270 frequency for that character
271 4) find out how often the character actually meets that goal
272 5) the character that best meets its goal is the delimiter
273 For performance reasons, the data is evaluated in chunks, so it can
274 try and evaluate the smallest portion of the data possible, evaluating
275 additional chunks as necessary.
276 """
277
278 data = filter(None, data.split('\n'))
279
280 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
281
282 # build frequency tables
283 chunkLength = min(10, len(data))
284 iteration = 0
285 charFrequency = {}
286 modes = {}
287 delims = {}
288 start, end = 0, min(chunkLength, len(data))
289 while start < len(data):
290 iteration += 1
291 for line in data[start:end]:
292 for char in ascii:
293 metafrequency = charFrequency.get(char, {})
294 # must count even if frequency is 0
295 freq = line.strip().count(char)
296 # value is the mode
297 metafrequency[freq] = metafrequency.get(freq, 0) + 1
298 charFrequency[char] = metafrequency
299
300 for char in charFrequency.keys():
301 items = charFrequency[char].items()
302 if len(items) == 1 and items[0][0] == 0:
303 continue
304 # get the mode of the frequencies
305 if len(items) > 1:
306 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
307 items)
308 # adjust the mode - subtract the sum of all
309 # other frequencies
310 items.remove(modes[char])
311 modes[char] = (modes[char][0], modes[char][1]
312 - reduce(lambda a, b: (0, a[1] + b[1]),
313 items)[1])
314 else:
315 modes[char] = items[0]
316
317 # build a list of possible delimiters
318 modeList = modes.items()
319 total = float(chunkLength * iteration)
320 # (rows of consistent data) / (number of rows) = 100%
321 consistency = 1.0
322 # minimum consistency threshold
323 threshold = 0.9
324 while len(delims) == 0 and consistency >= threshold:
325 for k, v in modeList:
326 if v[0] > 0 and v[1] > 0:
327 if (v[1]/total) >= consistency:
328 delims[k] = v
329 consistency -= 0.01
330
331 if len(delims) == 1:
332 delim = delims.keys()[0]
333 skipinitialspace = (data[0].count(delim) ==
334 data[0].count("%c " % delim))
335 return (delim, skipinitialspace)
336
337 # analyze another chunkLength lines
338 start = end
339 end += chunkLength
340
341 if not delims:
342 return ('', 0)
343
344 # if there's more than one, fall back to a 'preferred' list
345 if len(delims) > 1:
346 for d in self.preferred:
347 if d in delims.keys():
348 skipinitialspace = (data[0].count(d) ==
349 data[0].count("%c " % d))
350 return (d, skipinitialspace)
351
352 # finally, just return the first damn character in the list
353 delim = delims.keys()[0]
354 skipinitialspace = (data[0].count(delim) ==
355 data[0].count("%c " % delim))
356 return (delim, skipinitialspace)
357
358
359 def _hasHeaders(self, fileobj, dialect):
360 # Creates a dictionary of types of data in each column. If any
361 # column is of a single type (say, integers), *except* for the first
362 # row, then the first row is presumed to be labels. If the type
363 # can't be determined, it is assumed to be a string in which case
364 # the length of the string is the determining factor: if all of the
365 # rows except for the first are the same length, it's a header.
366 # Finally, a 'vote' is taken at the end for each column, adding or
367 # subtracting from the likelihood of the first row being a header.
368
369 def seval(item):
370 """
371 Strips parens from item prior to calling eval in an
372 attempt to make it safer
373 """
374 return eval(item.replace('(', '').replace(')', ''))
375
376 # rewind the fileobj - this might not work for some file-like
377 # objects...
378 fileobj.seek(0)
379
Fred Drake7c852f32003-04-25 14:27:00 +0000380 r = csv.reader(fileobj,
381 delimiter=dialect.delimiter,
382 quotechar=dialect.quotechar,
383 skipinitialspace=dialect.skipinitialspace)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000384
Fred Drake7c852f32003-04-25 14:27:00 +0000385 header = r.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000386
387 columns = len(header)
388 columnTypes = {}
389 for i in range(columns): columnTypes[i] = None
390
391 checked = 0
Fred Drake7c852f32003-04-25 14:27:00 +0000392 for row in r:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000393 # arbitrary number of rows to check, to keep it sane
394 if checked > 20:
395 break
396 checked += 1
397
398 if len(row) != columns:
399 continue # skip rows that have irregular number of columns
400
401 for col in columnTypes.keys():
402 try:
403 try:
404 # is it a built-in type (besides string)?
405 thisType = type(seval(row[col]))
406 except OverflowError:
407 # a long int?
408 thisType = type(seval(row[col] + 'L'))
409 thisType = type(0) # treat long ints as int
410 except:
411 # fallback to length of string
412 thisType = len(row[col])
413
414 if thisType != columnTypes[col]:
415 if columnTypes[col] is None: # add new column type
416 columnTypes[col] = thisType
417 else:
418 # type is inconsistent, remove column from
419 # consideration
420 del columnTypes[col]
421
422 # finally, compare results against first row and "vote"
423 # on whether it's a header
424 hasHeader = 0
425 for col, colType in columnTypes.items():
426 if type(colType) == type(0): # it's a length
427 if len(header[col]) != colType:
428 hasHeader += 1
429 else:
430 hasHeader -= 1
431 else: # attempt typecast
432 try:
433 eval("%s(%s)" % (colType.__name__, header[col]))
434 except:
435 hasHeader += 1
436 else:
437 hasHeader -= 1
438
439 return hasHeader > 0