blob: 3db5dac5db666380c03d77473fa05efcbbb075bf [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
Brett Cannon9fc56312008-08-09 23:39:11 +00007from functools import reduce
Skip Montanaro04ae7052003-04-24 20:21:31 +00008from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +000010 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000011 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000013from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000014
Skip Montanaro1448d472003-04-25 14:47:16 +000015try:
16 from cStringIO import StringIO
17except ImportError:
18 from StringIO import StringIO
19
Skip Montanaro04ae7052003-04-24 20:21:31 +000020__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
Brett Cannon88f801d2008-08-18 00:46:22 +000021 "Error", "Dialect", "__doc__", "excel", "excel_tab",
22 "field_size_limit", "reader", "writer",
Skip Montanaro04ae7052003-04-24 20:21:31 +000023 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25
26class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000027 """Describe an Excel dialect.
28
29 This must be subclassed (see csv.excel). Valid attributes are:
30 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
31 lineterminator, quoting.
32
33 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000034 _name = ""
35 _valid = False
36 # placeholders
37 delimiter = None
38 quotechar = None
39 escapechar = None
40 doublequote = None
41 skipinitialspace = None
42 lineterminator = None
43 quoting = None
44
45 def __init__(self):
46 if self.__class__ != Dialect:
47 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000048 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000049
50 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000051 try:
52 _Dialect(self)
53 except TypeError, e:
54 # We do this for compatibility with py2.3
55 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000056
57class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000058 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000059 delimiter = ','
60 quotechar = '"'
61 doublequote = True
62 skipinitialspace = False
63 lineterminator = '\r\n'
64 quoting = QUOTE_MINIMAL
65register_dialect("excel", excel)
66
67class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000068 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000069 delimiter = '\t'
70register_dialect("excel-tab", excel_tab)
71
72
73class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000074 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000075 dialect="excel", *args, **kwds):
Skip Montanaroa032bf42008-08-08 22:52:51 +000076 self._fieldnames = fieldnames # list of keys for the dict
Skip Montanaro04ae7052003-04-24 20:21:31 +000077 self.restkey = restkey # key to catch long rows
78 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000079 self.reader = reader(f, dialect, *args, **kwds)
Georg Brandl77354cf2008-03-21 20:01:51 +000080 self.dialect = dialect
81 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000082
83 def __iter__(self):
84 return self
85
Skip Montanaroa032bf42008-08-08 22:52:51 +000086 @property
87 def fieldnames(self):
88 if self._fieldnames is None:
89 try:
90 self._fieldnames = self.reader.next()
91 except StopIteration:
92 pass
93 self.line_num = self.reader.line_num
94 return self._fieldnames
95
96 @fieldnames.setter
97 def fieldnames(self, value):
98 self._fieldnames = value
99
Skip Montanaro04ae7052003-04-24 20:21:31 +0000100 def next(self):
Skip Montanaroa032bf42008-08-08 22:52:51 +0000101 if self.line_num == 0:
102 # Used only for its side effect.
103 self.fieldnames
Skip Montanaro04ae7052003-04-24 20:21:31 +0000104 row = self.reader.next()
Georg Brandl77354cf2008-03-21 20:01:51 +0000105 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +0000106
Skip Montanaro04ae7052003-04-24 20:21:31 +0000107 # unlike the basic reader, we prefer not to return blanks,
108 # because we will typically wind up with a dict full of None
109 # values
110 while row == []:
111 row = self.reader.next()
112 d = dict(zip(self.fieldnames, row))
113 lf = len(self.fieldnames)
114 lr = len(row)
115 if lf < lr:
116 d[self.restkey] = row[lf:]
117 elif lf > lr:
118 for key in self.fieldnames[lr:]:
119 d[key] = self.restval
120 return d
121
122
123class DictWriter:
124 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000125 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000126 self.fieldnames = fieldnames # list of keys for the dict
127 self.restval = restval # for writing short dicts
128 if extrasaction.lower() not in ("raise", "ignore"):
129 raise ValueError, \
130 ("extrasaction (%s) must be 'raise' or 'ignore'" %
131 extrasaction)
132 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000133 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000134
135 def _dict_to_list(self, rowdict):
136 if self.extrasaction == "raise":
Georg Brandl94fe3f52007-03-13 09:32:11 +0000137 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
138 if wrong_fields:
139 raise ValueError("dict contains fields not in fieldnames: " +
140 ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000141 return [rowdict.get(key, self.restval) for key in self.fieldnames]
142
143 def writerow(self, rowdict):
144 return self.writer.writerow(self._dict_to_list(rowdict))
145
146 def writerows(self, rowdicts):
147 rows = []
148 for rowdict in rowdicts:
149 rows.append(self._dict_to_list(rowdict))
150 return self.writer.writerows(rows)
151
Raymond Hettinger39a55922003-06-12 03:01:55 +0000152# Guard Sniffer's type checking against builds that exclude complex()
153try:
154 complex
155except NameError:
156 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000157
158class Sniffer:
159 '''
160 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000161 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000162 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000163 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000164 # in case there is more than one possible delimiter
165 self.preferred = [',', '\t', ';', ' ', ':']
166
Skip Montanaro04ae7052003-04-24 20:21:31 +0000167
Skip Montanaro77892372003-05-19 15:33:36 +0000168 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000169 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000170 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000171 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000172
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000173 quotechar, doublequote, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000174 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000175 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000176 delimiter, skipinitialspace = self._guess_delimiter(sample,
177 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000178
Skip Montanaro39b29be2005-12-30 05:09:48 +0000179 if not delimiter:
180 raise Error, "Could not determine delimiter"
181
Skip Montanaro1448d472003-04-25 14:47:16 +0000182 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000183 _name = "sniffed"
184 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000185 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000186 # escapechar = ''
Skip Montanaro04ae7052003-04-24 20:21:31 +0000187
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000188 dialect.doublequote = doublequote
Skip Montanaro1448d472003-04-25 14:47:16 +0000189 dialect.delimiter = delimiter
190 # _csv.reader won't accept a quotechar of ''
191 dialect.quotechar = quotechar or '"'
192 dialect.skipinitialspace = skipinitialspace
193
194 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000195
196
Skip Montanaro77892372003-05-19 15:33:36 +0000197 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000198 """
199 Looks for text enclosed between two identical quotes
200 (the probable quotechar) which are preceded and followed
201 by the same character (the probable delimiter).
202 For example:
203 ,'some text',
204 The quote with the most wins, same with the delimiter.
205 If there is no quotechar the delimiter can't be determined
206 this way.
207 """
208
209 matches = []
210 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
211 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
212 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
213 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000214 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000215 matches = regexp.findall(data)
216 if matches:
217 break
218
219 if not matches:
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000220 # (quotechar, doublequote, delimiter, skipinitialspace)
221 return ('', False, None, 0)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000222 quotes = {}
223 delims = {}
224 spaces = 0
225 for m in matches:
226 n = regexp.groupindex['quote'] - 1
227 key = m[n]
228 if key:
229 quotes[key] = quotes.get(key, 0) + 1
230 try:
231 n = regexp.groupindex['delim'] - 1
232 key = m[n]
233 except KeyError:
234 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000235 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000236 delims[key] = delims.get(key, 0) + 1
237 try:
238 n = regexp.groupindex['space'] - 1
239 except KeyError:
240 continue
241 if m[n]:
242 spaces += 1
243
244 quotechar = reduce(lambda a, b, quotes = quotes:
245 (quotes[a] > quotes[b]) and a or b, quotes.keys())
246
247 if delims:
248 delim = reduce(lambda a, b, delims = delims:
249 (delims[a] > delims[b]) and a or b, delims.keys())
250 skipinitialspace = delims[delim] == spaces
251 if delim == '\n': # most likely a file with a single column
252 delim = ''
253 else:
254 # there is *no* delimiter, it's a single column of quoted data
255 delim = ''
256 skipinitialspace = 0
257
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000258 # if we see an extra quote between delimiters, we've got a
259 # double quoted format
260 dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
261 {'delim':delim, 'quote':quotechar}, re.MULTILINE)
262
263
264
265 if dq_regexp.search(data):
266 doublequote = True
267 else:
268 doublequote = False
269
270 return (quotechar, doublequote, delim, skipinitialspace)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000271
272
Skip Montanaro77892372003-05-19 15:33:36 +0000273 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000274 """
275 The delimiter /should/ occur the same number of times on
276 each row. However, due to malformed data, it may not. We don't want
277 an all or nothing approach, so we allow for small variations in this
278 number.
279 1) build a table of the frequency of each character on every line.
280 2) build a table of freqencies of this frequency (meta-frequency?),
281 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
282 7 times in 2 rows'
283 3) use the mode of the meta-frequency to determine the /expected/
284 frequency for that character
285 4) find out how often the character actually meets that goal
286 5) the character that best meets its goal is the delimiter
287 For performance reasons, the data is evaluated in chunks, so it can
288 try and evaluate the smallest portion of the data possible, evaluating
289 additional chunks as necessary.
290 """
291
292 data = filter(None, data.split('\n'))
293
294 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
295
296 # build frequency tables
297 chunkLength = min(10, len(data))
298 iteration = 0
299 charFrequency = {}
300 modes = {}
301 delims = {}
302 start, end = 0, min(chunkLength, len(data))
303 while start < len(data):
304 iteration += 1
305 for line in data[start:end]:
306 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000307 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000308 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000309 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000310 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000311 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
312 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000313
314 for char in charFrequency.keys():
315 items = charFrequency[char].items()
316 if len(items) == 1 and items[0][0] == 0:
317 continue
318 # get the mode of the frequencies
319 if len(items) > 1:
320 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
321 items)
322 # adjust the mode - subtract the sum of all
323 # other frequencies
324 items.remove(modes[char])
325 modes[char] = (modes[char][0], modes[char][1]
326 - reduce(lambda a, b: (0, a[1] + b[1]),
327 items)[1])
328 else:
329 modes[char] = items[0]
330
331 # build a list of possible delimiters
332 modeList = modes.items()
333 total = float(chunkLength * iteration)
334 # (rows of consistent data) / (number of rows) = 100%
335 consistency = 1.0
336 # minimum consistency threshold
337 threshold = 0.9
338 while len(delims) == 0 and consistency >= threshold:
339 for k, v in modeList:
340 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000341 if ((v[1]/total) >= consistency and
342 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000343 delims[k] = v
344 consistency -= 0.01
345
346 if len(delims) == 1:
347 delim = delims.keys()[0]
348 skipinitialspace = (data[0].count(delim) ==
349 data[0].count("%c " % delim))
350 return (delim, skipinitialspace)
351
352 # analyze another chunkLength lines
353 start = end
354 end += chunkLength
355
356 if not delims:
357 return ('', 0)
358
359 # if there's more than one, fall back to a 'preferred' list
360 if len(delims) > 1:
361 for d in self.preferred:
362 if d in delims.keys():
363 skipinitialspace = (data[0].count(d) ==
364 data[0].count("%c " % d))
365 return (d, skipinitialspace)
366
Skip Montanaro39b29be2005-12-30 05:09:48 +0000367 # nothing else indicates a preference, pick the character that
368 # dominates(?)
369 items = [(v,k) for (k,v) in delims.items()]
370 items.sort()
371 delim = items[-1][1]
372
Skip Montanaro04ae7052003-04-24 20:21:31 +0000373 skipinitialspace = (data[0].count(delim) ==
374 data[0].count("%c " % delim))
375 return (delim, skipinitialspace)
376
377
Skip Montanaro1448d472003-04-25 14:47:16 +0000378 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000379 # Creates a dictionary of types of data in each column. If any
380 # column is of a single type (say, integers), *except* for the first
381 # row, then the first row is presumed to be labels. If the type
382 # can't be determined, it is assumed to be a string in which case
383 # the length of the string is the determining factor: if all of the
384 # rows except for the first are the same length, it's a header.
385 # Finally, a 'vote' is taken at the end for each column, adding or
386 # subtracting from the likelihood of the first row being a header.
387
Skip Montanaro1448d472003-04-25 14:47:16 +0000388 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000389
Skip Montanaro1448d472003-04-25 14:47:16 +0000390 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000391
392 columns = len(header)
393 columnTypes = {}
394 for i in range(columns): columnTypes[i] = None
395
396 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000397 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000398 # arbitrary number of rows to check, to keep it sane
399 if checked > 20:
400 break
401 checked += 1
402
403 if len(row) != columns:
404 continue # skip rows that have irregular number of columns
405
406 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000407
408 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000409 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000410 thisType(row[col])
411 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000412 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000413 pass
414 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000415 # fallback to length of string
416 thisType = len(row[col])
417
Raymond Hettinger39a55922003-06-12 03:01:55 +0000418 # treat longs as ints
419 if thisType == long:
420 thisType = int
421
Skip Montanaro04ae7052003-04-24 20:21:31 +0000422 if thisType != columnTypes[col]:
423 if columnTypes[col] is None: # add new column type
424 columnTypes[col] = thisType
425 else:
426 # type is inconsistent, remove column from
427 # consideration
428 del columnTypes[col]
429
430 # finally, compare results against first row and "vote"
431 # on whether it's a header
432 hasHeader = 0
433 for col, colType in columnTypes.items():
434 if type(colType) == type(0): # it's a length
435 if len(header[col]) != colType:
436 hasHeader += 1
437 else:
438 hasHeader -= 1
439 else: # attempt typecast
440 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000441 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000442 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000443 hasHeader += 1
444 else:
445 hasHeader -= 1
446
447 return hasHeader > 0