blob: c155ada794a606f26fb105d8227d100bbd986e1a [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
Brett Cannon9fc56312008-08-09 23:39:11 +00007from functools import reduce
Skip Montanaro04ae7052003-04-24 20:21:31 +00008from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +000010 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000011 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000013from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000014
Skip Montanaro1448d472003-04-25 14:47:16 +000015try:
16 from cStringIO import StringIO
17except ImportError:
18 from StringIO import StringIO
19
Skip Montanaro04ae7052003-04-24 20:21:31 +000020__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
Brett Cannon88f801d2008-08-18 00:46:22 +000021 "Error", "Dialect", "__doc__", "excel", "excel_tab",
22 "field_size_limit", "reader", "writer",
Skip Montanaro04ae7052003-04-24 20:21:31 +000023 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25
26class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000027 """Describe an Excel dialect.
28
29 This must be subclassed (see csv.excel). Valid attributes are:
30 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
31 lineterminator, quoting.
32
33 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000034 _name = ""
35 _valid = False
36 # placeholders
37 delimiter = None
38 quotechar = None
39 escapechar = None
40 doublequote = None
41 skipinitialspace = None
42 lineterminator = None
43 quoting = None
44
45 def __init__(self):
46 if self.__class__ != Dialect:
47 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000048 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000049
50 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000051 try:
52 _Dialect(self)
53 except TypeError, e:
54 # We do this for compatibility with py2.3
55 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000056
57class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000058 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000059 delimiter = ','
60 quotechar = '"'
61 doublequote = True
62 skipinitialspace = False
63 lineterminator = '\r\n'
64 quoting = QUOTE_MINIMAL
65register_dialect("excel", excel)
66
67class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000068 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000069 delimiter = '\t'
70register_dialect("excel-tab", excel_tab)
71
72
73class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000074 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000075 dialect="excel", *args, **kwds):
Skip Montanaroa032bf42008-08-08 22:52:51 +000076 self._fieldnames = fieldnames # list of keys for the dict
Skip Montanaro04ae7052003-04-24 20:21:31 +000077 self.restkey = restkey # key to catch long rows
78 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000079 self.reader = reader(f, dialect, *args, **kwds)
Georg Brandl77354cf2008-03-21 20:01:51 +000080 self.dialect = dialect
81 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000082
83 def __iter__(self):
84 return self
85
Skip Montanaroa032bf42008-08-08 22:52:51 +000086 @property
87 def fieldnames(self):
88 if self._fieldnames is None:
89 try:
90 self._fieldnames = self.reader.next()
91 except StopIteration:
92 pass
93 self.line_num = self.reader.line_num
94 return self._fieldnames
95
R David Murrayea76e872013-12-17 12:09:46 -050096 # Issue 20004: Because DictReader is a classic class, this setter is
97 # ignored. At this point in 2.7's lifecycle, it is too late to change the
98 # base class for fear of breaking working code. If you want to change
99 # fieldnames without overwriting the getter, set _fieldnames directly.
Skip Montanaroa032bf42008-08-08 22:52:51 +0000100 @fieldnames.setter
101 def fieldnames(self, value):
102 self._fieldnames = value
103
Skip Montanaro04ae7052003-04-24 20:21:31 +0000104 def next(self):
Skip Montanaroa032bf42008-08-08 22:52:51 +0000105 if self.line_num == 0:
106 # Used only for its side effect.
107 self.fieldnames
Skip Montanaro04ae7052003-04-24 20:21:31 +0000108 row = self.reader.next()
Georg Brandl77354cf2008-03-21 20:01:51 +0000109 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +0000110
Skip Montanaro04ae7052003-04-24 20:21:31 +0000111 # unlike the basic reader, we prefer not to return blanks,
112 # because we will typically wind up with a dict full of None
113 # values
114 while row == []:
115 row = self.reader.next()
116 d = dict(zip(self.fieldnames, row))
117 lf = len(self.fieldnames)
118 lr = len(row)
119 if lf < lr:
120 d[self.restkey] = row[lf:]
121 elif lf > lr:
122 for key in self.fieldnames[lr:]:
123 d[key] = self.restval
124 return d
125
126
127class DictWriter:
128 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000129 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000130 self.fieldnames = fieldnames # list of keys for the dict
131 self.restval = restval # for writing short dicts
132 if extrasaction.lower() not in ("raise", "ignore"):
133 raise ValueError, \
134 ("extrasaction (%s) must be 'raise' or 'ignore'" %
135 extrasaction)
136 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000137 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000138
Dirkjan Ochtman86148172010-02-23 21:09:52 +0000139 def writeheader(self):
140 header = dict(zip(self.fieldnames, self.fieldnames))
141 self.writerow(header)
142
Skip Montanaro04ae7052003-04-24 20:21:31 +0000143 def _dict_to_list(self, rowdict):
144 if self.extrasaction == "raise":
Georg Brandl94fe3f52007-03-13 09:32:11 +0000145 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
146 if wrong_fields:
R David Murrayeccf9c22013-11-19 13:25:24 -0500147 raise ValueError("dict contains fields not in fieldnames: "
148 + ", ".join([repr(x) for x in wrong_fields]))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000149 return [rowdict.get(key, self.restval) for key in self.fieldnames]
150
151 def writerow(self, rowdict):
152 return self.writer.writerow(self._dict_to_list(rowdict))
153
154 def writerows(self, rowdicts):
155 rows = []
156 for rowdict in rowdicts:
157 rows.append(self._dict_to_list(rowdict))
158 return self.writer.writerows(rows)
159
Raymond Hettinger39a55922003-06-12 03:01:55 +0000160# Guard Sniffer's type checking against builds that exclude complex()
161try:
162 complex
163except NameError:
164 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000165
166class Sniffer:
167 '''
168 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000169 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000170 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000171 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000172 # in case there is more than one possible delimiter
173 self.preferred = [',', '\t', ';', ' ', ':']
174
Skip Montanaro04ae7052003-04-24 20:21:31 +0000175
Skip Montanaro77892372003-05-19 15:33:36 +0000176 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000177 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000178 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000179 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000180
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000181 quotechar, doublequote, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000182 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000183 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000184 delimiter, skipinitialspace = self._guess_delimiter(sample,
185 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000186
Skip Montanaro39b29be2005-12-30 05:09:48 +0000187 if not delimiter:
188 raise Error, "Could not determine delimiter"
189
Skip Montanaro1448d472003-04-25 14:47:16 +0000190 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000191 _name = "sniffed"
192 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000193 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000194 # escapechar = ''
Skip Montanaro04ae7052003-04-24 20:21:31 +0000195
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000196 dialect.doublequote = doublequote
Skip Montanaro1448d472003-04-25 14:47:16 +0000197 dialect.delimiter = delimiter
198 # _csv.reader won't accept a quotechar of ''
199 dialect.quotechar = quotechar or '"'
200 dialect.skipinitialspace = skipinitialspace
201
202 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000203
204
Skip Montanaro77892372003-05-19 15:33:36 +0000205 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000206 """
207 Looks for text enclosed between two identical quotes
208 (the probable quotechar) which are preceded and followed
209 by the same character (the probable delimiter).
210 For example:
211 ,'some text',
212 The quote with the most wins, same with the delimiter.
213 If there is no quotechar the delimiter can't be determined
214 this way.
215 """
216
217 matches = []
218 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
219 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
220 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
221 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000222 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000223 matches = regexp.findall(data)
224 if matches:
225 break
226
227 if not matches:
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000228 # (quotechar, doublequote, delimiter, skipinitialspace)
229 return ('', False, None, 0)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000230 quotes = {}
231 delims = {}
232 spaces = 0
233 for m in matches:
234 n = regexp.groupindex['quote'] - 1
235 key = m[n]
236 if key:
237 quotes[key] = quotes.get(key, 0) + 1
238 try:
239 n = regexp.groupindex['delim'] - 1
240 key = m[n]
241 except KeyError:
242 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000243 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000244 delims[key] = delims.get(key, 0) + 1
245 try:
246 n = regexp.groupindex['space'] - 1
247 except KeyError:
248 continue
249 if m[n]:
250 spaces += 1
251
252 quotechar = reduce(lambda a, b, quotes = quotes:
253 (quotes[a] > quotes[b]) and a or b, quotes.keys())
254
255 if delims:
256 delim = reduce(lambda a, b, delims = delims:
257 (delims[a] > delims[b]) and a or b, delims.keys())
258 skipinitialspace = delims[delim] == spaces
259 if delim == '\n': # most likely a file with a single column
260 delim = ''
261 else:
262 # there is *no* delimiter, it's a single column of quoted data
263 delim = ''
264 skipinitialspace = 0
265
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000266 # if we see an extra quote between delimiters, we've got a
267 # double quoted format
R David Murray24dc7532013-06-29 18:43:59 -0400268 dq_regexp = re.compile(
269 r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
270 {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000271
272
273
274 if dq_regexp.search(data):
275 doublequote = True
276 else:
277 doublequote = False
278
279 return (quotechar, doublequote, delim, skipinitialspace)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000280
281
Skip Montanaro77892372003-05-19 15:33:36 +0000282 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000283 """
284 The delimiter /should/ occur the same number of times on
285 each row. However, due to malformed data, it may not. We don't want
286 an all or nothing approach, so we allow for small variations in this
287 number.
288 1) build a table of the frequency of each character on every line.
Ezio Melottic2077b02011-03-16 12:34:31 +0200289 2) build a table of frequencies of this frequency (meta-frequency?),
Skip Montanaro04ae7052003-04-24 20:21:31 +0000290 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
291 7 times in 2 rows'
292 3) use the mode of the meta-frequency to determine the /expected/
293 frequency for that character
294 4) find out how often the character actually meets that goal
295 5) the character that best meets its goal is the delimiter
296 For performance reasons, the data is evaluated in chunks, so it can
297 try and evaluate the smallest portion of the data possible, evaluating
298 additional chunks as necessary.
299 """
300
301 data = filter(None, data.split('\n'))
302
303 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
304
305 # build frequency tables
306 chunkLength = min(10, len(data))
307 iteration = 0
308 charFrequency = {}
309 modes = {}
310 delims = {}
311 start, end = 0, min(chunkLength, len(data))
312 while start < len(data):
313 iteration += 1
314 for line in data[start:end]:
315 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000316 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000317 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000318 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000319 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000320 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
321 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000322
323 for char in charFrequency.keys():
324 items = charFrequency[char].items()
325 if len(items) == 1 and items[0][0] == 0:
326 continue
327 # get the mode of the frequencies
328 if len(items) > 1:
329 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
330 items)
331 # adjust the mode - subtract the sum of all
332 # other frequencies
333 items.remove(modes[char])
334 modes[char] = (modes[char][0], modes[char][1]
335 - reduce(lambda a, b: (0, a[1] + b[1]),
336 items)[1])
337 else:
338 modes[char] = items[0]
339
340 # build a list of possible delimiters
341 modeList = modes.items()
342 total = float(chunkLength * iteration)
343 # (rows of consistent data) / (number of rows) = 100%
344 consistency = 1.0
345 # minimum consistency threshold
346 threshold = 0.9
347 while len(delims) == 0 and consistency >= threshold:
348 for k, v in modeList:
349 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000350 if ((v[1]/total) >= consistency and
351 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000352 delims[k] = v
353 consistency -= 0.01
354
355 if len(delims) == 1:
356 delim = delims.keys()[0]
357 skipinitialspace = (data[0].count(delim) ==
358 data[0].count("%c " % delim))
359 return (delim, skipinitialspace)
360
361 # analyze another chunkLength lines
362 start = end
363 end += chunkLength
364
365 if not delims:
366 return ('', 0)
367
368 # if there's more than one, fall back to a 'preferred' list
369 if len(delims) > 1:
370 for d in self.preferred:
371 if d in delims.keys():
372 skipinitialspace = (data[0].count(d) ==
373 data[0].count("%c " % d))
374 return (d, skipinitialspace)
375
Skip Montanaro39b29be2005-12-30 05:09:48 +0000376 # nothing else indicates a preference, pick the character that
377 # dominates(?)
378 items = [(v,k) for (k,v) in delims.items()]
379 items.sort()
380 delim = items[-1][1]
381
Skip Montanaro04ae7052003-04-24 20:21:31 +0000382 skipinitialspace = (data[0].count(delim) ==
383 data[0].count("%c " % delim))
384 return (delim, skipinitialspace)
385
386
Skip Montanaro1448d472003-04-25 14:47:16 +0000387 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000388 # Creates a dictionary of types of data in each column. If any
389 # column is of a single type (say, integers), *except* for the first
390 # row, then the first row is presumed to be labels. If the type
391 # can't be determined, it is assumed to be a string in which case
392 # the length of the string is the determining factor: if all of the
393 # rows except for the first are the same length, it's a header.
394 # Finally, a 'vote' is taken at the end for each column, adding or
395 # subtracting from the likelihood of the first row being a header.
396
Skip Montanaro1448d472003-04-25 14:47:16 +0000397 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000398
Skip Montanaro1448d472003-04-25 14:47:16 +0000399 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000400
401 columns = len(header)
402 columnTypes = {}
403 for i in range(columns): columnTypes[i] = None
404
405 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000406 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000407 # arbitrary number of rows to check, to keep it sane
408 if checked > 20:
409 break
410 checked += 1
411
412 if len(row) != columns:
413 continue # skip rows that have irregular number of columns
414
415 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000416
417 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000418 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000419 thisType(row[col])
420 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000421 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000422 pass
423 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000424 # fallback to length of string
425 thisType = len(row[col])
426
Raymond Hettinger39a55922003-06-12 03:01:55 +0000427 # treat longs as ints
428 if thisType == long:
429 thisType = int
430
Skip Montanaro04ae7052003-04-24 20:21:31 +0000431 if thisType != columnTypes[col]:
432 if columnTypes[col] is None: # add new column type
433 columnTypes[col] = thisType
434 else:
435 # type is inconsistent, remove column from
436 # consideration
437 del columnTypes[col]
438
439 # finally, compare results against first row and "vote"
440 # on whether it's a header
441 hasHeader = 0
442 for col, colType in columnTypes.items():
443 if type(colType) == type(0): # it's a length
444 if len(header[col]) != colType:
445 hasHeader += 1
446 else:
447 hasHeader -= 1
448 else: # attempt typecast
449 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000450 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000451 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000452 hasHeader += 1
453 else:
454 hasHeader -= 1
455
456 return hasHeader > 0