blob: 7210dacf9480d2f653f98d43d3a15509f978daf1 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Skip Montanaro1448d472003-04-25 14:47:16 +000014try:
15 from cStringIO import StringIO
16except ImportError:
17 from StringIO import StringIO
18
Skip Montanaro04ae7052003-04-24 20:21:31 +000019__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
23
24class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000025 """Describe an Excel dialect.
26
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
30
31 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000032 _name = ""
33 _valid = False
34 # placeholders
35 delimiter = None
36 quotechar = None
37 escapechar = None
38 doublequote = None
39 skipinitialspace = None
40 lineterminator = None
41 quoting = None
42
43 def __init__(self):
44 if self.__class__ != Dialect:
45 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000047
48 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 try:
50 _Dialect(self)
51 except TypeError, e:
52 # We do this for compatibility with py2.3
53 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000054
55class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000056 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000057 delimiter = ','
58 quotechar = '"'
59 doublequote = True
60 skipinitialspace = False
61 lineterminator = '\r\n'
62 quoting = QUOTE_MINIMAL
63register_dialect("excel", excel)
64
65class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000066 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000067 delimiter = '\t'
68register_dialect("excel-tab", excel_tab)
69
70
71class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000072 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000073 dialect="excel", *args, **kwds):
Skip Montanaroa032bf42008-08-08 22:52:51 +000074 self._fieldnames = fieldnames # list of keys for the dict
Skip Montanaro04ae7052003-04-24 20:21:31 +000075 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000077 self.reader = reader(f, dialect, *args, **kwds)
Georg Brandl77354cf2008-03-21 20:01:51 +000078 self.dialect = dialect
79 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000080
81 def __iter__(self):
82 return self
83
Skip Montanaroa032bf42008-08-08 22:52:51 +000084 @property
85 def fieldnames(self):
86 if self._fieldnames is None:
87 try:
88 self._fieldnames = self.reader.next()
89 except StopIteration:
90 pass
91 self.line_num = self.reader.line_num
92 return self._fieldnames
93
94 @fieldnames.setter
95 def fieldnames(self, value):
96 self._fieldnames = value
97
Skip Montanaro04ae7052003-04-24 20:21:31 +000098 def next(self):
Skip Montanaroa032bf42008-08-08 22:52:51 +000099 if self.line_num == 0:
100 # Used only for its side effect.
101 self.fieldnames
Skip Montanaro04ae7052003-04-24 20:21:31 +0000102 row = self.reader.next()
Georg Brandl77354cf2008-03-21 20:01:51 +0000103 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +0000104
Skip Montanaro04ae7052003-04-24 20:21:31 +0000105 # unlike the basic reader, we prefer not to return blanks,
106 # because we will typically wind up with a dict full of None
107 # values
108 while row == []:
109 row = self.reader.next()
110 d = dict(zip(self.fieldnames, row))
111 lf = len(self.fieldnames)
112 lr = len(row)
113 if lf < lr:
114 d[self.restkey] = row[lf:]
115 elif lf > lr:
116 for key in self.fieldnames[lr:]:
117 d[key] = self.restval
118 return d
119
120
121class DictWriter:
122 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000123 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000124 self.fieldnames = fieldnames # list of keys for the dict
125 self.restval = restval # for writing short dicts
126 if extrasaction.lower() not in ("raise", "ignore"):
127 raise ValueError, \
128 ("extrasaction (%s) must be 'raise' or 'ignore'" %
129 extrasaction)
130 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000131 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000132
133 def _dict_to_list(self, rowdict):
134 if self.extrasaction == "raise":
Georg Brandl94fe3f52007-03-13 09:32:11 +0000135 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
136 if wrong_fields:
137 raise ValueError("dict contains fields not in fieldnames: " +
138 ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000139 return [rowdict.get(key, self.restval) for key in self.fieldnames]
140
141 def writerow(self, rowdict):
142 return self.writer.writerow(self._dict_to_list(rowdict))
143
144 def writerows(self, rowdicts):
145 rows = []
146 for rowdict in rowdicts:
147 rows.append(self._dict_to_list(rowdict))
148 return self.writer.writerows(rows)
149
Raymond Hettinger39a55922003-06-12 03:01:55 +0000150# Guard Sniffer's type checking against builds that exclude complex()
151try:
152 complex
153except NameError:
154 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000155
156class Sniffer:
157 '''
158 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000159 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000160 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000161 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000162 # in case there is more than one possible delimiter
163 self.preferred = [',', '\t', ';', ' ', ':']
164
Skip Montanaro04ae7052003-04-24 20:21:31 +0000165
Skip Montanaro77892372003-05-19 15:33:36 +0000166 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000167 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000168 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000169 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000170
171 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000172 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000173 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000174 delimiter, skipinitialspace = self._guess_delimiter(sample,
175 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000176
Skip Montanaro39b29be2005-12-30 05:09:48 +0000177 if not delimiter:
178 raise Error, "Could not determine delimiter"
179
Skip Montanaro1448d472003-04-25 14:47:16 +0000180 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000181 _name = "sniffed"
182 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000183 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000184 # escapechar = ''
185 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000186
Skip Montanaro1448d472003-04-25 14:47:16 +0000187 dialect.delimiter = delimiter
188 # _csv.reader won't accept a quotechar of ''
189 dialect.quotechar = quotechar or '"'
190 dialect.skipinitialspace = skipinitialspace
191
192 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000193
194
Skip Montanaro77892372003-05-19 15:33:36 +0000195 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000196 """
197 Looks for text enclosed between two identical quotes
198 (the probable quotechar) which are preceded and followed
199 by the same character (the probable delimiter).
200 For example:
201 ,'some text',
202 The quote with the most wins, same with the delimiter.
203 If there is no quotechar the delimiter can't be determined
204 this way.
205 """
206
207 matches = []
208 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
209 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
210 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
211 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000212 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000213 matches = regexp.findall(data)
214 if matches:
215 break
216
217 if not matches:
218 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
219
220 quotes = {}
221 delims = {}
222 spaces = 0
223 for m in matches:
224 n = regexp.groupindex['quote'] - 1
225 key = m[n]
226 if key:
227 quotes[key] = quotes.get(key, 0) + 1
228 try:
229 n = regexp.groupindex['delim'] - 1
230 key = m[n]
231 except KeyError:
232 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000233 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000234 delims[key] = delims.get(key, 0) + 1
235 try:
236 n = regexp.groupindex['space'] - 1
237 except KeyError:
238 continue
239 if m[n]:
240 spaces += 1
241
242 quotechar = reduce(lambda a, b, quotes = quotes:
243 (quotes[a] > quotes[b]) and a or b, quotes.keys())
244
245 if delims:
246 delim = reduce(lambda a, b, delims = delims:
247 (delims[a] > delims[b]) and a or b, delims.keys())
248 skipinitialspace = delims[delim] == spaces
249 if delim == '\n': # most likely a file with a single column
250 delim = ''
251 else:
252 # there is *no* delimiter, it's a single column of quoted data
253 delim = ''
254 skipinitialspace = 0
255
256 return (quotechar, delim, skipinitialspace)
257
258
Skip Montanaro77892372003-05-19 15:33:36 +0000259 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000260 """
261 The delimiter /should/ occur the same number of times on
262 each row. However, due to malformed data, it may not. We don't want
263 an all or nothing approach, so we allow for small variations in this
264 number.
265 1) build a table of the frequency of each character on every line.
266 2) build a table of freqencies of this frequency (meta-frequency?),
267 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
268 7 times in 2 rows'
269 3) use the mode of the meta-frequency to determine the /expected/
270 frequency for that character
271 4) find out how often the character actually meets that goal
272 5) the character that best meets its goal is the delimiter
273 For performance reasons, the data is evaluated in chunks, so it can
274 try and evaluate the smallest portion of the data possible, evaluating
275 additional chunks as necessary.
276 """
277
278 data = filter(None, data.split('\n'))
279
280 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
281
282 # build frequency tables
283 chunkLength = min(10, len(data))
284 iteration = 0
285 charFrequency = {}
286 modes = {}
287 delims = {}
288 start, end = 0, min(chunkLength, len(data))
289 while start < len(data):
290 iteration += 1
291 for line in data[start:end]:
292 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000293 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000294 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000295 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000296 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000297 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
298 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000299
300 for char in charFrequency.keys():
301 items = charFrequency[char].items()
302 if len(items) == 1 and items[0][0] == 0:
303 continue
304 # get the mode of the frequencies
305 if len(items) > 1:
306 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
307 items)
308 # adjust the mode - subtract the sum of all
309 # other frequencies
310 items.remove(modes[char])
311 modes[char] = (modes[char][0], modes[char][1]
312 - reduce(lambda a, b: (0, a[1] + b[1]),
313 items)[1])
314 else:
315 modes[char] = items[0]
316
317 # build a list of possible delimiters
318 modeList = modes.items()
319 total = float(chunkLength * iteration)
320 # (rows of consistent data) / (number of rows) = 100%
321 consistency = 1.0
322 # minimum consistency threshold
323 threshold = 0.9
324 while len(delims) == 0 and consistency >= threshold:
325 for k, v in modeList:
326 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000327 if ((v[1]/total) >= consistency and
328 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000329 delims[k] = v
330 consistency -= 0.01
331
332 if len(delims) == 1:
333 delim = delims.keys()[0]
334 skipinitialspace = (data[0].count(delim) ==
335 data[0].count("%c " % delim))
336 return (delim, skipinitialspace)
337
338 # analyze another chunkLength lines
339 start = end
340 end += chunkLength
341
342 if not delims:
343 return ('', 0)
344
345 # if there's more than one, fall back to a 'preferred' list
346 if len(delims) > 1:
347 for d in self.preferred:
348 if d in delims.keys():
349 skipinitialspace = (data[0].count(d) ==
350 data[0].count("%c " % d))
351 return (d, skipinitialspace)
352
Skip Montanaro39b29be2005-12-30 05:09:48 +0000353 # nothing else indicates a preference, pick the character that
354 # dominates(?)
355 items = [(v,k) for (k,v) in delims.items()]
356 items.sort()
357 delim = items[-1][1]
358
Skip Montanaro04ae7052003-04-24 20:21:31 +0000359 skipinitialspace = (data[0].count(delim) ==
360 data[0].count("%c " % delim))
361 return (delim, skipinitialspace)
362
363
Skip Montanaro1448d472003-04-25 14:47:16 +0000364 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000365 # Creates a dictionary of types of data in each column. If any
366 # column is of a single type (say, integers), *except* for the first
367 # row, then the first row is presumed to be labels. If the type
368 # can't be determined, it is assumed to be a string in which case
369 # the length of the string is the determining factor: if all of the
370 # rows except for the first are the same length, it's a header.
371 # Finally, a 'vote' is taken at the end for each column, adding or
372 # subtracting from the likelihood of the first row being a header.
373
Skip Montanaro1448d472003-04-25 14:47:16 +0000374 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000375
Skip Montanaro1448d472003-04-25 14:47:16 +0000376 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000377
378 columns = len(header)
379 columnTypes = {}
380 for i in range(columns): columnTypes[i] = None
381
382 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000383 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000384 # arbitrary number of rows to check, to keep it sane
385 if checked > 20:
386 break
387 checked += 1
388
389 if len(row) != columns:
390 continue # skip rows that have irregular number of columns
391
392 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000393
394 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000395 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000396 thisType(row[col])
397 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000398 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000399 pass
400 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000401 # fallback to length of string
402 thisType = len(row[col])
403
Raymond Hettinger39a55922003-06-12 03:01:55 +0000404 # treat longs as ints
405 if thisType == long:
406 thisType = int
407
Skip Montanaro04ae7052003-04-24 20:21:31 +0000408 if thisType != columnTypes[col]:
409 if columnTypes[col] is None: # add new column type
410 columnTypes[col] = thisType
411 else:
412 # type is inconsistent, remove column from
413 # consideration
414 del columnTypes[col]
415
416 # finally, compare results against first row and "vote"
417 # on whether it's a header
418 hasHeader = 0
419 for col, colType in columnTypes.items():
420 if type(colType) == type(0): # it's a length
421 if len(header[col]) != colType:
422 hasHeader += 1
423 else:
424 hasHeader -= 1
425 else: # attempt typecast
426 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000427 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000428 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000429 hasHeader += 1
430 else:
431 hasHeader -= 1
432
433 return hasHeader > 0