blob: d08a86b6be888300ece7ea5c8c7be7b08171086f [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
10 __doc__
11
Skip Montanaro1448d472003-04-25 14:47:16 +000012try:
13 from cStringIO import StringIO
14except ImportError:
15 from StringIO import StringIO
16
Skip Montanaro04ae7052003-04-24 20:21:31 +000017__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
18 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
19 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
21
22class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000023 """Describe an Excel dialect.
24
25 This must be subclassed (see csv.excel). Valid attributes are:
26 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
27 lineterminator, quoting.
28
29 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000030 _name = ""
31 _valid = False
32 # placeholders
33 delimiter = None
34 quotechar = None
35 escapechar = None
36 doublequote = None
37 skipinitialspace = None
38 lineterminator = None
39 quoting = None
40
41 def __init__(self):
42 if self.__class__ != Dialect:
43 self._valid = True
44 errors = self._validate()
45 if errors != []:
46 raise Error, "Dialect did not validate: %s" % ", ".join(errors)
47
48 def _validate(self):
49 errors = []
50 if not self._valid:
51 errors.append("can't directly instantiate Dialect class")
52
53 if self.delimiter is None:
54 errors.append("delimiter character not set")
55 elif (not isinstance(self.delimiter, str) or
56 len(self.delimiter) > 1):
57 errors.append("delimiter must be one-character string")
58
59 if self.quotechar is None:
60 if self.quoting != QUOTE_NONE:
61 errors.append("quotechar not set")
62 elif (not isinstance(self.quotechar, str) or
63 len(self.quotechar) > 1):
64 errors.append("quotechar must be one-character string")
65
66 if self.lineterminator is None:
67 errors.append("lineterminator not set")
68 elif not isinstance(self.lineterminator, str):
69 errors.append("lineterminator must be a string")
70
Andrew McNamara1196cf12005-01-07 04:42:45 +000071 if self.doublequote not in (True, False) and self.quoting != QUOTE_NONE:
Skip Montanaro04ae7052003-04-24 20:21:31 +000072 errors.append("doublequote parameter must be True or False")
73
74 if self.skipinitialspace not in (True, False):
75 errors.append("skipinitialspace parameter must be True or False")
76
77 if self.quoting is None:
78 errors.append("quoting parameter not set")
79
80 if self.quoting is QUOTE_NONE:
81 if (not isinstance(self.escapechar, (unicode, str)) or
82 len(self.escapechar) > 1):
83 errors.append("escapechar must be a one-character string or unicode object")
84
85 return errors
86
87class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000088 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000089 delimiter = ','
90 quotechar = '"'
91 doublequote = True
92 skipinitialspace = False
93 lineterminator = '\r\n'
94 quoting = QUOTE_MINIMAL
95register_dialect("excel", excel)
96
97class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000098 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000099 delimiter = '\t'
100register_dialect("excel-tab", excel_tab)
101
102
103class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +0000104 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000105 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000106 self.fieldnames = fieldnames # list of keys for the dict
107 self.restkey = restkey # key to catch long rows
108 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000109 self.reader = reader(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000110
111 def __iter__(self):
112 return self
113
114 def next(self):
115 row = self.reader.next()
Skip Montanarodffeed32003-10-03 14:03:01 +0000116 if self.fieldnames is None:
117 self.fieldnames = row
118 row = self.reader.next()
119
Skip Montanaro04ae7052003-04-24 20:21:31 +0000120 # unlike the basic reader, we prefer not to return blanks,
121 # because we will typically wind up with a dict full of None
122 # values
123 while row == []:
124 row = self.reader.next()
125 d = dict(zip(self.fieldnames, row))
126 lf = len(self.fieldnames)
127 lr = len(row)
128 if lf < lr:
129 d[self.restkey] = row[lf:]
130 elif lf > lr:
131 for key in self.fieldnames[lr:]:
132 d[key] = self.restval
133 return d
134
135
136class DictWriter:
137 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000138 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000139 self.fieldnames = fieldnames # list of keys for the dict
140 self.restval = restval # for writing short dicts
141 if extrasaction.lower() not in ("raise", "ignore"):
142 raise ValueError, \
143 ("extrasaction (%s) must be 'raise' or 'ignore'" %
144 extrasaction)
145 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000146 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000147
148 def _dict_to_list(self, rowdict):
149 if self.extrasaction == "raise":
150 for k in rowdict.keys():
151 if k not in self.fieldnames:
152 raise ValueError, "dict contains fields not in fieldnames"
153 return [rowdict.get(key, self.restval) for key in self.fieldnames]
154
155 def writerow(self, rowdict):
156 return self.writer.writerow(self._dict_to_list(rowdict))
157
158 def writerows(self, rowdicts):
159 rows = []
160 for rowdict in rowdicts:
161 rows.append(self._dict_to_list(rowdict))
162 return self.writer.writerows(rows)
163
Raymond Hettinger39a55922003-06-12 03:01:55 +0000164# Guard Sniffer's type checking against builds that exclude complex()
165try:
166 complex
167except NameError:
168 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000169
170class Sniffer:
171 '''
172 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000173 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000174 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000175 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000176 # in case there is more than one possible delimiter
177 self.preferred = [',', '\t', ';', ' ', ':']
178
Skip Montanaro04ae7052003-04-24 20:21:31 +0000179
Skip Montanaro77892372003-05-19 15:33:36 +0000180 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000181 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000182 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000183 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000184
185 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000186 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000187 if delimiter is None:
Skip Montanaro77892372003-05-19 15:33:36 +0000188 delimiter, skipinitialspace = self._guess_delimiter(sample,
189 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000190
Skip Montanaro1448d472003-04-25 14:47:16 +0000191 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000192 _name = "sniffed"
193 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000194 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000195 # escapechar = ''
196 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000197
Skip Montanaro1448d472003-04-25 14:47:16 +0000198 dialect.delimiter = delimiter
199 # _csv.reader won't accept a quotechar of ''
200 dialect.quotechar = quotechar or '"'
201 dialect.skipinitialspace = skipinitialspace
202
203 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000204
205
Skip Montanaro77892372003-05-19 15:33:36 +0000206 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000207 """
208 Looks for text enclosed between two identical quotes
209 (the probable quotechar) which are preceded and followed
210 by the same character (the probable delimiter).
211 For example:
212 ,'some text',
213 The quote with the most wins, same with the delimiter.
214 If there is no quotechar the delimiter can't be determined
215 this way.
216 """
217
218 matches = []
219 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
220 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
221 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
222 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000223 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000224 matches = regexp.findall(data)
225 if matches:
226 break
227
228 if not matches:
229 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
230
231 quotes = {}
232 delims = {}
233 spaces = 0
234 for m in matches:
235 n = regexp.groupindex['quote'] - 1
236 key = m[n]
237 if key:
238 quotes[key] = quotes.get(key, 0) + 1
239 try:
240 n = regexp.groupindex['delim'] - 1
241 key = m[n]
242 except KeyError:
243 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000244 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000245 delims[key] = delims.get(key, 0) + 1
246 try:
247 n = regexp.groupindex['space'] - 1
248 except KeyError:
249 continue
250 if m[n]:
251 spaces += 1
252
253 quotechar = reduce(lambda a, b, quotes = quotes:
254 (quotes[a] > quotes[b]) and a or b, quotes.keys())
255
256 if delims:
257 delim = reduce(lambda a, b, delims = delims:
258 (delims[a] > delims[b]) and a or b, delims.keys())
259 skipinitialspace = delims[delim] == spaces
260 if delim == '\n': # most likely a file with a single column
261 delim = ''
262 else:
263 # there is *no* delimiter, it's a single column of quoted data
264 delim = ''
265 skipinitialspace = 0
266
267 return (quotechar, delim, skipinitialspace)
268
269
Skip Montanaro77892372003-05-19 15:33:36 +0000270 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000271 """
272 The delimiter /should/ occur the same number of times on
273 each row. However, due to malformed data, it may not. We don't want
274 an all or nothing approach, so we allow for small variations in this
275 number.
276 1) build a table of the frequency of each character on every line.
277 2) build a table of freqencies of this frequency (meta-frequency?),
278 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
279 7 times in 2 rows'
280 3) use the mode of the meta-frequency to determine the /expected/
281 frequency for that character
282 4) find out how often the character actually meets that goal
283 5) the character that best meets its goal is the delimiter
284 For performance reasons, the data is evaluated in chunks, so it can
285 try and evaluate the smallest portion of the data possible, evaluating
286 additional chunks as necessary.
287 """
288
289 data = filter(None, data.split('\n'))
290
291 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
292
293 # build frequency tables
294 chunkLength = min(10, len(data))
295 iteration = 0
296 charFrequency = {}
297 modes = {}
298 delims = {}
299 start, end = 0, min(chunkLength, len(data))
300 while start < len(data):
301 iteration += 1
302 for line in data[start:end]:
303 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000304 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000305 # must count even if frequency is 0
306 freq = line.strip().count(char)
307 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000308 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
309 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000310
311 for char in charFrequency.keys():
312 items = charFrequency[char].items()
313 if len(items) == 1 and items[0][0] == 0:
314 continue
315 # get the mode of the frequencies
316 if len(items) > 1:
317 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
318 items)
319 # adjust the mode - subtract the sum of all
320 # other frequencies
321 items.remove(modes[char])
322 modes[char] = (modes[char][0], modes[char][1]
323 - reduce(lambda a, b: (0, a[1] + b[1]),
324 items)[1])
325 else:
326 modes[char] = items[0]
327
328 # build a list of possible delimiters
329 modeList = modes.items()
330 total = float(chunkLength * iteration)
331 # (rows of consistent data) / (number of rows) = 100%
332 consistency = 1.0
333 # minimum consistency threshold
334 threshold = 0.9
335 while len(delims) == 0 and consistency >= threshold:
336 for k, v in modeList:
337 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000338 if ((v[1]/total) >= consistency and
339 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000340 delims[k] = v
341 consistency -= 0.01
342
343 if len(delims) == 1:
344 delim = delims.keys()[0]
345 skipinitialspace = (data[0].count(delim) ==
346 data[0].count("%c " % delim))
347 return (delim, skipinitialspace)
348
349 # analyze another chunkLength lines
350 start = end
351 end += chunkLength
352
353 if not delims:
354 return ('', 0)
355
356 # if there's more than one, fall back to a 'preferred' list
357 if len(delims) > 1:
358 for d in self.preferred:
359 if d in delims.keys():
360 skipinitialspace = (data[0].count(d) ==
361 data[0].count("%c " % d))
362 return (d, skipinitialspace)
363
364 # finally, just return the first damn character in the list
365 delim = delims.keys()[0]
366 skipinitialspace = (data[0].count(delim) ==
367 data[0].count("%c " % delim))
368 return (delim, skipinitialspace)
369
370
Skip Montanaro1448d472003-04-25 14:47:16 +0000371 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000372 # Creates a dictionary of types of data in each column. If any
373 # column is of a single type (say, integers), *except* for the first
374 # row, then the first row is presumed to be labels. If the type
375 # can't be determined, it is assumed to be a string in which case
376 # the length of the string is the determining factor: if all of the
377 # rows except for the first are the same length, it's a header.
378 # Finally, a 'vote' is taken at the end for each column, adding or
379 # subtracting from the likelihood of the first row being a header.
380
Skip Montanaro1448d472003-04-25 14:47:16 +0000381 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000382
Skip Montanaro1448d472003-04-25 14:47:16 +0000383 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000384
385 columns = len(header)
386 columnTypes = {}
387 for i in range(columns): columnTypes[i] = None
388
389 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000390 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000391 # arbitrary number of rows to check, to keep it sane
392 if checked > 20:
393 break
394 checked += 1
395
396 if len(row) != columns:
397 continue # skip rows that have irregular number of columns
398
399 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000400
401 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000402 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000403 thisType(row[col])
404 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000405 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000406 pass
407 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000408 # fallback to length of string
409 thisType = len(row[col])
410
Raymond Hettinger39a55922003-06-12 03:01:55 +0000411 # treat longs as ints
412 if thisType == long:
413 thisType = int
414
Skip Montanaro04ae7052003-04-24 20:21:31 +0000415 if thisType != columnTypes[col]:
416 if columnTypes[col] is None: # add new column type
417 columnTypes[col] = thisType
418 else:
419 # type is inconsistent, remove column from
420 # consideration
421 del columnTypes[col]
422
423 # finally, compare results against first row and "vote"
424 # on whether it's a header
425 hasHeader = 0
426 for col, colType in columnTypes.items():
427 if type(colType) == type(0): # it's a length
428 if len(header[col]) != colType:
429 hasHeader += 1
430 else:
431 hasHeader -= 1
432 else: # attempt typecast
433 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000434 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000435 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000436 hasHeader += 1
437 else:
438 hasHeader -= 1
439
440 return hasHeader > 0