blob: 984ed7e581b9a7ac15d47f4711081d06de1e67cf [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
Brett Cannon9fc56312008-08-09 23:39:11 +00007from functools import reduce
Skip Montanaro04ae7052003-04-24 20:21:31 +00008from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +000010 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000011 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000013from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000014
Skip Montanaro1448d472003-04-25 14:47:16 +000015try:
16 from cStringIO import StringIO
17except ImportError:
18 from StringIO import StringIO
19
Skip Montanaro04ae7052003-04-24 20:21:31 +000020__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
Brett Cannon88f801d2008-08-18 00:46:22 +000021 "Error", "Dialect", "__doc__", "excel", "excel_tab",
22 "field_size_limit", "reader", "writer",
Skip Montanaro04ae7052003-04-24 20:21:31 +000023 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25
26class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000027 """Describe an Excel dialect.
28
29 This must be subclassed (see csv.excel). Valid attributes are:
30 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
31 lineterminator, quoting.
32
33 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000034 _name = ""
35 _valid = False
36 # placeholders
37 delimiter = None
38 quotechar = None
39 escapechar = None
40 doublequote = None
41 skipinitialspace = None
42 lineterminator = None
43 quoting = None
44
45 def __init__(self):
46 if self.__class__ != Dialect:
47 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000048 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000049
50 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000051 try:
52 _Dialect(self)
53 except TypeError, e:
54 # We do this for compatibility with py2.3
55 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000056
57class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000058 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000059 delimiter = ','
60 quotechar = '"'
61 doublequote = True
62 skipinitialspace = False
63 lineterminator = '\r\n'
64 quoting = QUOTE_MINIMAL
65register_dialect("excel", excel)
66
67class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000068 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000069 delimiter = '\t'
70register_dialect("excel-tab", excel_tab)
71
72
73class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000074 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000075 dialect="excel", *args, **kwds):
Skip Montanaroa032bf42008-08-08 22:52:51 +000076 self._fieldnames = fieldnames # list of keys for the dict
Skip Montanaro04ae7052003-04-24 20:21:31 +000077 self.restkey = restkey # key to catch long rows
78 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000079 self.reader = reader(f, dialect, *args, **kwds)
Georg Brandl77354cf2008-03-21 20:01:51 +000080 self.dialect = dialect
81 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000082
83 def __iter__(self):
84 return self
85
Skip Montanaroa032bf42008-08-08 22:52:51 +000086 @property
87 def fieldnames(self):
88 if self._fieldnames is None:
89 try:
90 self._fieldnames = self.reader.next()
91 except StopIteration:
92 pass
93 self.line_num = self.reader.line_num
94 return self._fieldnames
95
96 @fieldnames.setter
97 def fieldnames(self, value):
98 self._fieldnames = value
99
Skip Montanaro04ae7052003-04-24 20:21:31 +0000100 def next(self):
Skip Montanaroa032bf42008-08-08 22:52:51 +0000101 if self.line_num == 0:
102 # Used only for its side effect.
103 self.fieldnames
Skip Montanaro04ae7052003-04-24 20:21:31 +0000104 row = self.reader.next()
Georg Brandl77354cf2008-03-21 20:01:51 +0000105 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +0000106
Skip Montanaro04ae7052003-04-24 20:21:31 +0000107 # unlike the basic reader, we prefer not to return blanks,
108 # because we will typically wind up with a dict full of None
109 # values
110 while row == []:
111 row = self.reader.next()
112 d = dict(zip(self.fieldnames, row))
113 lf = len(self.fieldnames)
114 lr = len(row)
115 if lf < lr:
116 d[self.restkey] = row[lf:]
117 elif lf > lr:
118 for key in self.fieldnames[lr:]:
119 d[key] = self.restval
120 return d
121
122
123class DictWriter:
124 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000125 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000126 self.fieldnames = fieldnames # list of keys for the dict
127 self.restval = restval # for writing short dicts
128 if extrasaction.lower() not in ("raise", "ignore"):
129 raise ValueError, \
130 ("extrasaction (%s) must be 'raise' or 'ignore'" %
131 extrasaction)
132 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000133 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000134
Dirkjan Ochtman86148172010-02-23 21:09:52 +0000135 def writeheader(self):
136 header = dict(zip(self.fieldnames, self.fieldnames))
137 self.writerow(header)
138
Skip Montanaro04ae7052003-04-24 20:21:31 +0000139 def _dict_to_list(self, rowdict):
140 if self.extrasaction == "raise":
Georg Brandl94fe3f52007-03-13 09:32:11 +0000141 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
142 if wrong_fields:
143 raise ValueError("dict contains fields not in fieldnames: " +
144 ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000145 return [rowdict.get(key, self.restval) for key in self.fieldnames]
146
147 def writerow(self, rowdict):
148 return self.writer.writerow(self._dict_to_list(rowdict))
149
150 def writerows(self, rowdicts):
151 rows = []
152 for rowdict in rowdicts:
153 rows.append(self._dict_to_list(rowdict))
154 return self.writer.writerows(rows)
155
Raymond Hettinger39a55922003-06-12 03:01:55 +0000156# Guard Sniffer's type checking against builds that exclude complex()
157try:
158 complex
159except NameError:
160 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000161
162class Sniffer:
163 '''
164 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000165 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000167 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000168 # in case there is more than one possible delimiter
169 self.preferred = [',', '\t', ';', ' ', ':']
170
Skip Montanaro04ae7052003-04-24 20:21:31 +0000171
Skip Montanaro77892372003-05-19 15:33:36 +0000172 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000173 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000174 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000175 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000176
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000177 quotechar, doublequote, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000178 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000179 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000180 delimiter, skipinitialspace = self._guess_delimiter(sample,
181 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000182
Skip Montanaro39b29be2005-12-30 05:09:48 +0000183 if not delimiter:
184 raise Error, "Could not determine delimiter"
185
Skip Montanaro1448d472003-04-25 14:47:16 +0000186 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000187 _name = "sniffed"
188 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000189 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000190 # escapechar = ''
Skip Montanaro04ae7052003-04-24 20:21:31 +0000191
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000192 dialect.doublequote = doublequote
Skip Montanaro1448d472003-04-25 14:47:16 +0000193 dialect.delimiter = delimiter
194 # _csv.reader won't accept a quotechar of ''
195 dialect.quotechar = quotechar or '"'
196 dialect.skipinitialspace = skipinitialspace
197
198 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000199
200
Skip Montanaro77892372003-05-19 15:33:36 +0000201 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000202 """
203 Looks for text enclosed between two identical quotes
204 (the probable quotechar) which are preceded and followed
205 by the same character (the probable delimiter).
206 For example:
207 ,'some text',
208 The quote with the most wins, same with the delimiter.
209 If there is no quotechar the delimiter can't be determined
210 this way.
211 """
212
213 matches = []
214 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
215 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
216 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
217 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000218 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000219 matches = regexp.findall(data)
220 if matches:
221 break
222
223 if not matches:
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000224 # (quotechar, doublequote, delimiter, skipinitialspace)
225 return ('', False, None, 0)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000226 quotes = {}
227 delims = {}
228 spaces = 0
229 for m in matches:
230 n = regexp.groupindex['quote'] - 1
231 key = m[n]
232 if key:
233 quotes[key] = quotes.get(key, 0) + 1
234 try:
235 n = regexp.groupindex['delim'] - 1
236 key = m[n]
237 except KeyError:
238 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000239 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000240 delims[key] = delims.get(key, 0) + 1
241 try:
242 n = regexp.groupindex['space'] - 1
243 except KeyError:
244 continue
245 if m[n]:
246 spaces += 1
247
248 quotechar = reduce(lambda a, b, quotes = quotes:
249 (quotes[a] > quotes[b]) and a or b, quotes.keys())
250
251 if delims:
252 delim = reduce(lambda a, b, delims = delims:
253 (delims[a] > delims[b]) and a or b, delims.keys())
254 skipinitialspace = delims[delim] == spaces
255 if delim == '\n': # most likely a file with a single column
256 delim = ''
257 else:
258 # there is *no* delimiter, it's a single column of quoted data
259 delim = ''
260 skipinitialspace = 0
261
Skip Montanarob4fd4d32009-09-28 02:12:27 +0000262 # if we see an extra quote between delimiters, we've got a
263 # double quoted format
264 dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
265 {'delim':delim, 'quote':quotechar}, re.MULTILINE)
266
267
268
269 if dq_regexp.search(data):
270 doublequote = True
271 else:
272 doublequote = False
273
274 return (quotechar, doublequote, delim, skipinitialspace)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000275
276
Skip Montanaro77892372003-05-19 15:33:36 +0000277 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000278 """
279 The delimiter /should/ occur the same number of times on
280 each row. However, due to malformed data, it may not. We don't want
281 an all or nothing approach, so we allow for small variations in this
282 number.
283 1) build a table of the frequency of each character on every line.
Ezio Melottic2077b02011-03-16 12:34:31 +0200284 2) build a table of frequencies of this frequency (meta-frequency?),
Skip Montanaro04ae7052003-04-24 20:21:31 +0000285 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
286 7 times in 2 rows'
287 3) use the mode of the meta-frequency to determine the /expected/
288 frequency for that character
289 4) find out how often the character actually meets that goal
290 5) the character that best meets its goal is the delimiter
291 For performance reasons, the data is evaluated in chunks, so it can
292 try and evaluate the smallest portion of the data possible, evaluating
293 additional chunks as necessary.
294 """
295
296 data = filter(None, data.split('\n'))
297
298 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
299
300 # build frequency tables
301 chunkLength = min(10, len(data))
302 iteration = 0
303 charFrequency = {}
304 modes = {}
305 delims = {}
306 start, end = 0, min(chunkLength, len(data))
307 while start < len(data):
308 iteration += 1
309 for line in data[start:end]:
310 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000311 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000312 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000313 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000314 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000315 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
316 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000317
318 for char in charFrequency.keys():
319 items = charFrequency[char].items()
320 if len(items) == 1 and items[0][0] == 0:
321 continue
322 # get the mode of the frequencies
323 if len(items) > 1:
324 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
325 items)
326 # adjust the mode - subtract the sum of all
327 # other frequencies
328 items.remove(modes[char])
329 modes[char] = (modes[char][0], modes[char][1]
330 - reduce(lambda a, b: (0, a[1] + b[1]),
331 items)[1])
332 else:
333 modes[char] = items[0]
334
335 # build a list of possible delimiters
336 modeList = modes.items()
337 total = float(chunkLength * iteration)
338 # (rows of consistent data) / (number of rows) = 100%
339 consistency = 1.0
340 # minimum consistency threshold
341 threshold = 0.9
342 while len(delims) == 0 and consistency >= threshold:
343 for k, v in modeList:
344 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000345 if ((v[1]/total) >= consistency and
346 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000347 delims[k] = v
348 consistency -= 0.01
349
350 if len(delims) == 1:
351 delim = delims.keys()[0]
352 skipinitialspace = (data[0].count(delim) ==
353 data[0].count("%c " % delim))
354 return (delim, skipinitialspace)
355
356 # analyze another chunkLength lines
357 start = end
358 end += chunkLength
359
360 if not delims:
361 return ('', 0)
362
363 # if there's more than one, fall back to a 'preferred' list
364 if len(delims) > 1:
365 for d in self.preferred:
366 if d in delims.keys():
367 skipinitialspace = (data[0].count(d) ==
368 data[0].count("%c " % d))
369 return (d, skipinitialspace)
370
Skip Montanaro39b29be2005-12-30 05:09:48 +0000371 # nothing else indicates a preference, pick the character that
372 # dominates(?)
373 items = [(v,k) for (k,v) in delims.items()]
374 items.sort()
375 delim = items[-1][1]
376
Skip Montanaro04ae7052003-04-24 20:21:31 +0000377 skipinitialspace = (data[0].count(delim) ==
378 data[0].count("%c " % delim))
379 return (delim, skipinitialspace)
380
381
Skip Montanaro1448d472003-04-25 14:47:16 +0000382 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000383 # Creates a dictionary of types of data in each column. If any
384 # column is of a single type (say, integers), *except* for the first
385 # row, then the first row is presumed to be labels. If the type
386 # can't be determined, it is assumed to be a string in which case
387 # the length of the string is the determining factor: if all of the
388 # rows except for the first are the same length, it's a header.
389 # Finally, a 'vote' is taken at the end for each column, adding or
390 # subtracting from the likelihood of the first row being a header.
391
Skip Montanaro1448d472003-04-25 14:47:16 +0000392 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000393
Skip Montanaro1448d472003-04-25 14:47:16 +0000394 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000395
396 columns = len(header)
397 columnTypes = {}
398 for i in range(columns): columnTypes[i] = None
399
400 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000401 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000402 # arbitrary number of rows to check, to keep it sane
403 if checked > 20:
404 break
405 checked += 1
406
407 if len(row) != columns:
408 continue # skip rows that have irregular number of columns
409
410 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000411
412 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000413 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000414 thisType(row[col])
415 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000416 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000417 pass
418 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000419 # fallback to length of string
420 thisType = len(row[col])
421
Raymond Hettinger39a55922003-06-12 03:01:55 +0000422 # treat longs as ints
423 if thisType == long:
424 thisType = int
425
Skip Montanaro04ae7052003-04-24 20:21:31 +0000426 if thisType != columnTypes[col]:
427 if columnTypes[col] is None: # add new column type
428 columnTypes[col] = thisType
429 else:
430 # type is inconsistent, remove column from
431 # consideration
432 del columnTypes[col]
433
434 # finally, compare results against first row and "vote"
435 # on whether it's a header
436 hasHeader = 0
437 for col, colType in columnTypes.items():
438 if type(colType) == type(0): # it's a length
439 if len(header[col]) != colType:
440 hasHeader += 1
441 else:
442 hasHeader -= 1
443 else: # attempt typecast
444 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000445 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000446 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000447 hasHeader += 1
448 else:
449 hasHeader -= 1
450
451 return hasHeader > 0