blob: 83b8aa447def68483c1dc7bb7e910e7856b02432 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
10 __doc__
11
Skip Montanaro1448d472003-04-25 14:47:16 +000012try:
13 from cStringIO import StringIO
14except ImportError:
15 from StringIO import StringIO
16
Skip Montanaro04ae7052003-04-24 20:21:31 +000017__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
18 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
19 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
21
22class Dialect:
23 _name = ""
24 _valid = False
25 # placeholders
26 delimiter = None
27 quotechar = None
28 escapechar = None
29 doublequote = None
30 skipinitialspace = None
31 lineterminator = None
32 quoting = None
33
34 def __init__(self):
35 if self.__class__ != Dialect:
36 self._valid = True
37 errors = self._validate()
38 if errors != []:
39 raise Error, "Dialect did not validate: %s" % ", ".join(errors)
40
41 def _validate(self):
42 errors = []
43 if not self._valid:
44 errors.append("can't directly instantiate Dialect class")
45
46 if self.delimiter is None:
47 errors.append("delimiter character not set")
48 elif (not isinstance(self.delimiter, str) or
49 len(self.delimiter) > 1):
50 errors.append("delimiter must be one-character string")
51
52 if self.quotechar is None:
53 if self.quoting != QUOTE_NONE:
54 errors.append("quotechar not set")
55 elif (not isinstance(self.quotechar, str) or
56 len(self.quotechar) > 1):
57 errors.append("quotechar must be one-character string")
58
59 if self.lineterminator is None:
60 errors.append("lineterminator not set")
61 elif not isinstance(self.lineterminator, str):
62 errors.append("lineterminator must be a string")
63
64 if self.doublequote not in (True, False):
65 errors.append("doublequote parameter must be True or False")
66
67 if self.skipinitialspace not in (True, False):
68 errors.append("skipinitialspace parameter must be True or False")
69
70 if self.quoting is None:
71 errors.append("quoting parameter not set")
72
73 if self.quoting is QUOTE_NONE:
74 if (not isinstance(self.escapechar, (unicode, str)) or
75 len(self.escapechar) > 1):
76 errors.append("escapechar must be a one-character string or unicode object")
77
78 return errors
79
80class excel(Dialect):
81 delimiter = ','
82 quotechar = '"'
83 doublequote = True
84 skipinitialspace = False
85 lineterminator = '\r\n'
86 quoting = QUOTE_MINIMAL
87register_dialect("excel", excel)
88
89class excel_tab(excel):
90 delimiter = '\t'
91register_dialect("excel-tab", excel_tab)
92
93
94class DictReader:
95 def __init__(self, f, fieldnames, restkey=None, restval=None,
96 dialect="excel", *args):
97 self.fieldnames = fieldnames # list of keys for the dict
98 self.restkey = restkey # key to catch long rows
99 self.restval = restval # default value for short rows
100 self.reader = reader(f, dialect, *args)
101
102 def __iter__(self):
103 return self
104
105 def next(self):
106 row = self.reader.next()
107 # unlike the basic reader, we prefer not to return blanks,
108 # because we will typically wind up with a dict full of None
109 # values
110 while row == []:
111 row = self.reader.next()
112 d = dict(zip(self.fieldnames, row))
113 lf = len(self.fieldnames)
114 lr = len(row)
115 if lf < lr:
116 d[self.restkey] = row[lf:]
117 elif lf > lr:
118 for key in self.fieldnames[lr:]:
119 d[key] = self.restval
120 return d
121
122
123class DictWriter:
124 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
125 dialect="excel", *args):
126 self.fieldnames = fieldnames # list of keys for the dict
127 self.restval = restval # for writing short dicts
128 if extrasaction.lower() not in ("raise", "ignore"):
129 raise ValueError, \
130 ("extrasaction (%s) must be 'raise' or 'ignore'" %
131 extrasaction)
132 self.extrasaction = extrasaction
133 self.writer = writer(f, dialect, *args)
134
135 def _dict_to_list(self, rowdict):
136 if self.extrasaction == "raise":
137 for k in rowdict.keys():
138 if k not in self.fieldnames:
139 raise ValueError, "dict contains fields not in fieldnames"
140 return [rowdict.get(key, self.restval) for key in self.fieldnames]
141
142 def writerow(self, rowdict):
143 return self.writer.writerow(self._dict_to_list(rowdict))
144
145 def writerows(self, rowdicts):
146 rows = []
147 for rowdict in rowdicts:
148 rows.append(self._dict_to_list(rowdict))
149 return self.writer.writerows(rows)
150
151
152class Sniffer:
153 '''
154 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000155 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000156 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000157 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000158 # in case there is more than one possible delimiter
159 self.preferred = [',', '\t', ';', ' ', ':']
160
Skip Montanaro04ae7052003-04-24 20:21:31 +0000161
Skip Montanaro77892372003-05-19 15:33:36 +0000162 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000164 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000165 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166
167 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000168 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000169 if delimiter is None:
Skip Montanaro77892372003-05-19 15:33:36 +0000170 delimiter, skipinitialspace = self._guess_delimiter(sample,
171 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000172
Skip Montanaro1448d472003-04-25 14:47:16 +0000173 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000174 _name = "sniffed"
175 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000176 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000177 # escapechar = ''
178 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000179
Skip Montanaro1448d472003-04-25 14:47:16 +0000180 dialect.delimiter = delimiter
181 # _csv.reader won't accept a quotechar of ''
182 dialect.quotechar = quotechar or '"'
183 dialect.skipinitialspace = skipinitialspace
184
185 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000186
187
Skip Montanaro77892372003-05-19 15:33:36 +0000188 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000189 """
190 Looks for text enclosed between two identical quotes
191 (the probable quotechar) which are preceded and followed
192 by the same character (the probable delimiter).
193 For example:
194 ,'some text',
195 The quote with the most wins, same with the delimiter.
196 If there is no quotechar the delimiter can't be determined
197 this way.
198 """
199
200 matches = []
201 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
202 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
203 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
204 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
205 regexp = re.compile(restr, re.S | re.M)
206 matches = regexp.findall(data)
207 if matches:
208 break
209
210 if not matches:
211 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
212
213 quotes = {}
214 delims = {}
215 spaces = 0
216 for m in matches:
217 n = regexp.groupindex['quote'] - 1
218 key = m[n]
219 if key:
220 quotes[key] = quotes.get(key, 0) + 1
221 try:
222 n = regexp.groupindex['delim'] - 1
223 key = m[n]
224 except KeyError:
225 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000226 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000227 delims[key] = delims.get(key, 0) + 1
228 try:
229 n = regexp.groupindex['space'] - 1
230 except KeyError:
231 continue
232 if m[n]:
233 spaces += 1
234
235 quotechar = reduce(lambda a, b, quotes = quotes:
236 (quotes[a] > quotes[b]) and a or b, quotes.keys())
237
238 if delims:
239 delim = reduce(lambda a, b, delims = delims:
240 (delims[a] > delims[b]) and a or b, delims.keys())
241 skipinitialspace = delims[delim] == spaces
242 if delim == '\n': # most likely a file with a single column
243 delim = ''
244 else:
245 # there is *no* delimiter, it's a single column of quoted data
246 delim = ''
247 skipinitialspace = 0
248
249 return (quotechar, delim, skipinitialspace)
250
251
Skip Montanaro77892372003-05-19 15:33:36 +0000252 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000253 """
254 The delimiter /should/ occur the same number of times on
255 each row. However, due to malformed data, it may not. We don't want
256 an all or nothing approach, so we allow for small variations in this
257 number.
258 1) build a table of the frequency of each character on every line.
259 2) build a table of freqencies of this frequency (meta-frequency?),
260 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
261 7 times in 2 rows'
262 3) use the mode of the meta-frequency to determine the /expected/
263 frequency for that character
264 4) find out how often the character actually meets that goal
265 5) the character that best meets its goal is the delimiter
266 For performance reasons, the data is evaluated in chunks, so it can
267 try and evaluate the smallest portion of the data possible, evaluating
268 additional chunks as necessary.
269 """
270
271 data = filter(None, data.split('\n'))
272
273 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
274
275 # build frequency tables
276 chunkLength = min(10, len(data))
277 iteration = 0
278 charFrequency = {}
279 modes = {}
280 delims = {}
281 start, end = 0, min(chunkLength, len(data))
282 while start < len(data):
283 iteration += 1
284 for line in data[start:end]:
285 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000286 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000287 # must count even if frequency is 0
288 freq = line.strip().count(char)
289 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000290 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
291 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000292
293 for char in charFrequency.keys():
294 items = charFrequency[char].items()
295 if len(items) == 1 and items[0][0] == 0:
296 continue
297 # get the mode of the frequencies
298 if len(items) > 1:
299 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
300 items)
301 # adjust the mode - subtract the sum of all
302 # other frequencies
303 items.remove(modes[char])
304 modes[char] = (modes[char][0], modes[char][1]
305 - reduce(lambda a, b: (0, a[1] + b[1]),
306 items)[1])
307 else:
308 modes[char] = items[0]
309
310 # build a list of possible delimiters
311 modeList = modes.items()
312 total = float(chunkLength * iteration)
313 # (rows of consistent data) / (number of rows) = 100%
314 consistency = 1.0
315 # minimum consistency threshold
316 threshold = 0.9
317 while len(delims) == 0 and consistency >= threshold:
318 for k, v in modeList:
319 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000320 if ((v[1]/total) >= consistency and
321 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000322 delims[k] = v
323 consistency -= 0.01
324
325 if len(delims) == 1:
326 delim = delims.keys()[0]
327 skipinitialspace = (data[0].count(delim) ==
328 data[0].count("%c " % delim))
329 return (delim, skipinitialspace)
330
331 # analyze another chunkLength lines
332 start = end
333 end += chunkLength
334
335 if not delims:
336 return ('', 0)
337
338 # if there's more than one, fall back to a 'preferred' list
339 if len(delims) > 1:
340 for d in self.preferred:
341 if d in delims.keys():
342 skipinitialspace = (data[0].count(d) ==
343 data[0].count("%c " % d))
344 return (d, skipinitialspace)
345
346 # finally, just return the first damn character in the list
347 delim = delims.keys()[0]
348 skipinitialspace = (data[0].count(delim) ==
349 data[0].count("%c " % delim))
350 return (delim, skipinitialspace)
351
352
Skip Montanaro1448d472003-04-25 14:47:16 +0000353 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000354 # Creates a dictionary of types of data in each column. If any
355 # column is of a single type (say, integers), *except* for the first
356 # row, then the first row is presumed to be labels. If the type
357 # can't be determined, it is assumed to be a string in which case
358 # the length of the string is the determining factor: if all of the
359 # rows except for the first are the same length, it's a header.
360 # Finally, a 'vote' is taken at the end for each column, adding or
361 # subtracting from the likelihood of the first row being a header.
362
363 def seval(item):
364 """
365 Strips parens from item prior to calling eval in an
366 attempt to make it safer
367 """
368 return eval(item.replace('(', '').replace(')', ''))
369
Skip Montanaro1448d472003-04-25 14:47:16 +0000370 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000371
Skip Montanaro1448d472003-04-25 14:47:16 +0000372 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000373
374 columns = len(header)
375 columnTypes = {}
376 for i in range(columns): columnTypes[i] = None
377
378 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000379 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000380 # arbitrary number of rows to check, to keep it sane
381 if checked > 20:
382 break
383 checked += 1
384
385 if len(row) != columns:
386 continue # skip rows that have irregular number of columns
387
388 for col in columnTypes.keys():
389 try:
390 try:
391 # is it a built-in type (besides string)?
392 thisType = type(seval(row[col]))
393 except OverflowError:
394 # a long int?
395 thisType = type(seval(row[col] + 'L'))
396 thisType = type(0) # treat long ints as int
397 except:
398 # fallback to length of string
399 thisType = len(row[col])
400
401 if thisType != columnTypes[col]:
402 if columnTypes[col] is None: # add new column type
403 columnTypes[col] = thisType
404 else:
405 # type is inconsistent, remove column from
406 # consideration
407 del columnTypes[col]
408
409 # finally, compare results against first row and "vote"
410 # on whether it's a header
411 hasHeader = 0
412 for col, colType in columnTypes.items():
413 if type(colType) == type(0): # it's a length
414 if len(header[col]) != colType:
415 hasHeader += 1
416 else:
417 hasHeader -= 1
418 else: # attempt typecast
419 try:
420 eval("%s(%s)" % (colType.__name__, header[col]))
421 except:
422 hasHeader += 1
423 else:
424 hasHeader -= 1
425
426 return hasHeader > 0