blob: 4f6e5c03a27bcce4dec8ab139a0a0743953bd925 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
Brett Cannon9fc56312008-08-09 23:39:11 +00007from functools import reduce
Skip Montanaro04ae7052003-04-24 20:21:31 +00008from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +000010 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000011 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000013from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000014
Skip Montanaro1448d472003-04-25 14:47:16 +000015try:
16 from cStringIO import StringIO
17except ImportError:
18 from StringIO import StringIO
19
Skip Montanaro04ae7052003-04-24 20:21:31 +000020__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
21 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
22 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
23 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
24
25class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000026 """Describe an Excel dialect.
27
28 This must be subclassed (see csv.excel). Valid attributes are:
29 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
30 lineterminator, quoting.
31
32 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000033 _name = ""
34 _valid = False
35 # placeholders
36 delimiter = None
37 quotechar = None
38 escapechar = None
39 doublequote = None
40 skipinitialspace = None
41 lineterminator = None
42 quoting = None
43
44 def __init__(self):
45 if self.__class__ != Dialect:
46 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000047 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000048
49 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000050 try:
51 _Dialect(self)
52 except TypeError, e:
53 # We do this for compatibility with py2.3
54 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000055
56class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000057 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000058 delimiter = ','
59 quotechar = '"'
60 doublequote = True
61 skipinitialspace = False
62 lineterminator = '\r\n'
63 quoting = QUOTE_MINIMAL
64register_dialect("excel", excel)
65
66class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000067 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000068 delimiter = '\t'
69register_dialect("excel-tab", excel_tab)
70
71
72class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000073 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000074 dialect="excel", *args, **kwds):
Skip Montanaroa032bf42008-08-08 22:52:51 +000075 self._fieldnames = fieldnames # list of keys for the dict
Skip Montanaro04ae7052003-04-24 20:21:31 +000076 self.restkey = restkey # key to catch long rows
77 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000078 self.reader = reader(f, dialect, *args, **kwds)
Georg Brandl77354cf2008-03-21 20:01:51 +000079 self.dialect = dialect
80 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000081
82 def __iter__(self):
83 return self
84
Skip Montanaroa032bf42008-08-08 22:52:51 +000085 @property
86 def fieldnames(self):
87 if self._fieldnames is None:
88 try:
89 self._fieldnames = self.reader.next()
90 except StopIteration:
91 pass
92 self.line_num = self.reader.line_num
93 return self._fieldnames
94
95 @fieldnames.setter
96 def fieldnames(self, value):
97 self._fieldnames = value
98
Skip Montanaro04ae7052003-04-24 20:21:31 +000099 def next(self):
Skip Montanaroa032bf42008-08-08 22:52:51 +0000100 if self.line_num == 0:
101 # Used only for its side effect.
102 self.fieldnames
Skip Montanaro04ae7052003-04-24 20:21:31 +0000103 row = self.reader.next()
Georg Brandl77354cf2008-03-21 20:01:51 +0000104 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +0000105
Skip Montanaro04ae7052003-04-24 20:21:31 +0000106 # unlike the basic reader, we prefer not to return blanks,
107 # because we will typically wind up with a dict full of None
108 # values
109 while row == []:
110 row = self.reader.next()
111 d = dict(zip(self.fieldnames, row))
112 lf = len(self.fieldnames)
113 lr = len(row)
114 if lf < lr:
115 d[self.restkey] = row[lf:]
116 elif lf > lr:
117 for key in self.fieldnames[lr:]:
118 d[key] = self.restval
119 return d
120
121
122class DictWriter:
123 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000124 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000125 self.fieldnames = fieldnames # list of keys for the dict
126 self.restval = restval # for writing short dicts
127 if extrasaction.lower() not in ("raise", "ignore"):
128 raise ValueError, \
129 ("extrasaction (%s) must be 'raise' or 'ignore'" %
130 extrasaction)
131 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000132 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000133
134 def _dict_to_list(self, rowdict):
135 if self.extrasaction == "raise":
Georg Brandl94fe3f52007-03-13 09:32:11 +0000136 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
137 if wrong_fields:
138 raise ValueError("dict contains fields not in fieldnames: " +
139 ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000140 return [rowdict.get(key, self.restval) for key in self.fieldnames]
141
142 def writerow(self, rowdict):
143 return self.writer.writerow(self._dict_to_list(rowdict))
144
145 def writerows(self, rowdicts):
146 rows = []
147 for rowdict in rowdicts:
148 rows.append(self._dict_to_list(rowdict))
149 return self.writer.writerows(rows)
150
Raymond Hettinger39a55922003-06-12 03:01:55 +0000151# Guard Sniffer's type checking against builds that exclude complex()
152try:
153 complex
154except NameError:
155 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000156
157class Sniffer:
158 '''
159 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000160 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000161 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000162 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 # in case there is more than one possible delimiter
164 self.preferred = [',', '\t', ';', ' ', ':']
165
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166
Skip Montanaro77892372003-05-19 15:33:36 +0000167 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000168 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000169 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000170 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000171
172 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000173 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000174 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000175 delimiter, skipinitialspace = self._guess_delimiter(sample,
176 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000177
Skip Montanaro39b29be2005-12-30 05:09:48 +0000178 if not delimiter:
179 raise Error, "Could not determine delimiter"
180
Skip Montanaro1448d472003-04-25 14:47:16 +0000181 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000182 _name = "sniffed"
183 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000184 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000185 # escapechar = ''
186 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000187
Skip Montanaro1448d472003-04-25 14:47:16 +0000188 dialect.delimiter = delimiter
189 # _csv.reader won't accept a quotechar of ''
190 dialect.quotechar = quotechar or '"'
191 dialect.skipinitialspace = skipinitialspace
192
193 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000194
195
Skip Montanaro77892372003-05-19 15:33:36 +0000196 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000197 """
198 Looks for text enclosed between two identical quotes
199 (the probable quotechar) which are preceded and followed
200 by the same character (the probable delimiter).
201 For example:
202 ,'some text',
203 The quote with the most wins, same with the delimiter.
204 If there is no quotechar the delimiter can't be determined
205 this way.
206 """
207
208 matches = []
209 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
210 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
211 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
212 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000213 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000214 matches = regexp.findall(data)
215 if matches:
216 break
217
218 if not matches:
219 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
220
221 quotes = {}
222 delims = {}
223 spaces = 0
224 for m in matches:
225 n = regexp.groupindex['quote'] - 1
226 key = m[n]
227 if key:
228 quotes[key] = quotes.get(key, 0) + 1
229 try:
230 n = regexp.groupindex['delim'] - 1
231 key = m[n]
232 except KeyError:
233 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000234 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000235 delims[key] = delims.get(key, 0) + 1
236 try:
237 n = regexp.groupindex['space'] - 1
238 except KeyError:
239 continue
240 if m[n]:
241 spaces += 1
242
243 quotechar = reduce(lambda a, b, quotes = quotes:
244 (quotes[a] > quotes[b]) and a or b, quotes.keys())
245
246 if delims:
247 delim = reduce(lambda a, b, delims = delims:
248 (delims[a] > delims[b]) and a or b, delims.keys())
249 skipinitialspace = delims[delim] == spaces
250 if delim == '\n': # most likely a file with a single column
251 delim = ''
252 else:
253 # there is *no* delimiter, it's a single column of quoted data
254 delim = ''
255 skipinitialspace = 0
256
257 return (quotechar, delim, skipinitialspace)
258
259
Skip Montanaro77892372003-05-19 15:33:36 +0000260 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000261 """
262 The delimiter /should/ occur the same number of times on
263 each row. However, due to malformed data, it may not. We don't want
264 an all or nothing approach, so we allow for small variations in this
265 number.
266 1) build a table of the frequency of each character on every line.
267 2) build a table of freqencies of this frequency (meta-frequency?),
268 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
269 7 times in 2 rows'
270 3) use the mode of the meta-frequency to determine the /expected/
271 frequency for that character
272 4) find out how often the character actually meets that goal
273 5) the character that best meets its goal is the delimiter
274 For performance reasons, the data is evaluated in chunks, so it can
275 try and evaluate the smallest portion of the data possible, evaluating
276 additional chunks as necessary.
277 """
278
279 data = filter(None, data.split('\n'))
280
281 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
282
283 # build frequency tables
284 chunkLength = min(10, len(data))
285 iteration = 0
286 charFrequency = {}
287 modes = {}
288 delims = {}
289 start, end = 0, min(chunkLength, len(data))
290 while start < len(data):
291 iteration += 1
292 for line in data[start:end]:
293 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000294 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000295 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000296 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000297 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000298 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
299 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000300
301 for char in charFrequency.keys():
302 items = charFrequency[char].items()
303 if len(items) == 1 and items[0][0] == 0:
304 continue
305 # get the mode of the frequencies
306 if len(items) > 1:
307 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
308 items)
309 # adjust the mode - subtract the sum of all
310 # other frequencies
311 items.remove(modes[char])
312 modes[char] = (modes[char][0], modes[char][1]
313 - reduce(lambda a, b: (0, a[1] + b[1]),
314 items)[1])
315 else:
316 modes[char] = items[0]
317
318 # build a list of possible delimiters
319 modeList = modes.items()
320 total = float(chunkLength * iteration)
321 # (rows of consistent data) / (number of rows) = 100%
322 consistency = 1.0
323 # minimum consistency threshold
324 threshold = 0.9
325 while len(delims) == 0 and consistency >= threshold:
326 for k, v in modeList:
327 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000328 if ((v[1]/total) >= consistency and
329 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000330 delims[k] = v
331 consistency -= 0.01
332
333 if len(delims) == 1:
334 delim = delims.keys()[0]
335 skipinitialspace = (data[0].count(delim) ==
336 data[0].count("%c " % delim))
337 return (delim, skipinitialspace)
338
339 # analyze another chunkLength lines
340 start = end
341 end += chunkLength
342
343 if not delims:
344 return ('', 0)
345
346 # if there's more than one, fall back to a 'preferred' list
347 if len(delims) > 1:
348 for d in self.preferred:
349 if d in delims.keys():
350 skipinitialspace = (data[0].count(d) ==
351 data[0].count("%c " % d))
352 return (d, skipinitialspace)
353
Skip Montanaro39b29be2005-12-30 05:09:48 +0000354 # nothing else indicates a preference, pick the character that
355 # dominates(?)
356 items = [(v,k) for (k,v) in delims.items()]
357 items.sort()
358 delim = items[-1][1]
359
Skip Montanaro04ae7052003-04-24 20:21:31 +0000360 skipinitialspace = (data[0].count(delim) ==
361 data[0].count("%c " % delim))
362 return (delim, skipinitialspace)
363
364
Skip Montanaro1448d472003-04-25 14:47:16 +0000365 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000366 # Creates a dictionary of types of data in each column. If any
367 # column is of a single type (say, integers), *except* for the first
368 # row, then the first row is presumed to be labels. If the type
369 # can't be determined, it is assumed to be a string in which case
370 # the length of the string is the determining factor: if all of the
371 # rows except for the first are the same length, it's a header.
372 # Finally, a 'vote' is taken at the end for each column, adding or
373 # subtracting from the likelihood of the first row being a header.
374
Skip Montanaro1448d472003-04-25 14:47:16 +0000375 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000376
Skip Montanaro1448d472003-04-25 14:47:16 +0000377 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000378
379 columns = len(header)
380 columnTypes = {}
381 for i in range(columns): columnTypes[i] = None
382
383 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000384 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000385 # arbitrary number of rows to check, to keep it sane
386 if checked > 20:
387 break
388 checked += 1
389
390 if len(row) != columns:
391 continue # skip rows that have irregular number of columns
392
393 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000394
395 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000396 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000397 thisType(row[col])
398 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000399 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000400 pass
401 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000402 # fallback to length of string
403 thisType = len(row[col])
404
Raymond Hettinger39a55922003-06-12 03:01:55 +0000405 # treat longs as ints
406 if thisType == long:
407 thisType = int
408
Skip Montanaro04ae7052003-04-24 20:21:31 +0000409 if thisType != columnTypes[col]:
410 if columnTypes[col] is None: # add new column type
411 columnTypes[col] = thisType
412 else:
413 # type is inconsistent, remove column from
414 # consideration
415 del columnTypes[col]
416
417 # finally, compare results against first row and "vote"
418 # on whether it's a header
419 hasHeader = 0
420 for col, colType in columnTypes.items():
421 if type(colType) == type(0): # it's a length
422 if len(header[col]) != colType:
423 hasHeader += 1
424 else:
425 hasHeader -= 1
426 else: # attempt typecast
427 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000428 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000429 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000430 hasHeader += 1
431 else:
432 hasHeader -= 1
433
434 return hasHeader > 0