blob: f2389fd30425944a39163b92ae7b36f118efe300 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
10 __doc__
11
Skip Montanaro1448d472003-04-25 14:47:16 +000012try:
13 from cStringIO import StringIO
14except ImportError:
15 from StringIO import StringIO
16
Skip Montanaro04ae7052003-04-24 20:21:31 +000017__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
18 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
19 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
21
22class Dialect:
23 _name = ""
24 _valid = False
25 # placeholders
26 delimiter = None
27 quotechar = None
28 escapechar = None
29 doublequote = None
30 skipinitialspace = None
31 lineterminator = None
32 quoting = None
33
34 def __init__(self):
35 if self.__class__ != Dialect:
36 self._valid = True
37 errors = self._validate()
38 if errors != []:
39 raise Error, "Dialect did not validate: %s" % ", ".join(errors)
40
41 def _validate(self):
42 errors = []
43 if not self._valid:
44 errors.append("can't directly instantiate Dialect class")
45
46 if self.delimiter is None:
47 errors.append("delimiter character not set")
48 elif (not isinstance(self.delimiter, str) or
49 len(self.delimiter) > 1):
50 errors.append("delimiter must be one-character string")
51
52 if self.quotechar is None:
53 if self.quoting != QUOTE_NONE:
54 errors.append("quotechar not set")
55 elif (not isinstance(self.quotechar, str) or
56 len(self.quotechar) > 1):
57 errors.append("quotechar must be one-character string")
58
59 if self.lineterminator is None:
60 errors.append("lineterminator not set")
61 elif not isinstance(self.lineterminator, str):
62 errors.append("lineterminator must be a string")
63
64 if self.doublequote not in (True, False):
65 errors.append("doublequote parameter must be True or False")
66
67 if self.skipinitialspace not in (True, False):
68 errors.append("skipinitialspace parameter must be True or False")
69
70 if self.quoting is None:
71 errors.append("quoting parameter not set")
72
73 if self.quoting is QUOTE_NONE:
74 if (not isinstance(self.escapechar, (unicode, str)) or
75 len(self.escapechar) > 1):
76 errors.append("escapechar must be a one-character string or unicode object")
77
78 return errors
79
80class excel(Dialect):
81 delimiter = ','
82 quotechar = '"'
83 doublequote = True
84 skipinitialspace = False
85 lineterminator = '\r\n'
86 quoting = QUOTE_MINIMAL
87register_dialect("excel", excel)
88
89class excel_tab(excel):
90 delimiter = '\t'
91register_dialect("excel-tab", excel_tab)
92
93
94class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000095 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000096 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000097 self.fieldnames = fieldnames # list of keys for the dict
98 self.restkey = restkey # key to catch long rows
99 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000100 self.reader = reader(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000101
102 def __iter__(self):
103 return self
104
105 def next(self):
106 row = self.reader.next()
Skip Montanarodffeed32003-10-03 14:03:01 +0000107 if self.fieldnames is None:
108 self.fieldnames = row
109 row = self.reader.next()
110
Skip Montanaro04ae7052003-04-24 20:21:31 +0000111 # unlike the basic reader, we prefer not to return blanks,
112 # because we will typically wind up with a dict full of None
113 # values
114 while row == []:
115 row = self.reader.next()
116 d = dict(zip(self.fieldnames, row))
117 lf = len(self.fieldnames)
118 lr = len(row)
119 if lf < lr:
120 d[self.restkey] = row[lf:]
121 elif lf > lr:
122 for key in self.fieldnames[lr:]:
123 d[key] = self.restval
124 return d
125
126
127class DictWriter:
128 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000129 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000130 self.fieldnames = fieldnames # list of keys for the dict
131 self.restval = restval # for writing short dicts
132 if extrasaction.lower() not in ("raise", "ignore"):
133 raise ValueError, \
134 ("extrasaction (%s) must be 'raise' or 'ignore'" %
135 extrasaction)
136 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000137 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000138
139 def _dict_to_list(self, rowdict):
140 if self.extrasaction == "raise":
141 for k in rowdict.keys():
142 if k not in self.fieldnames:
143 raise ValueError, "dict contains fields not in fieldnames"
144 return [rowdict.get(key, self.restval) for key in self.fieldnames]
145
146 def writerow(self, rowdict):
147 return self.writer.writerow(self._dict_to_list(rowdict))
148
149 def writerows(self, rowdicts):
150 rows = []
151 for rowdict in rowdicts:
152 rows.append(self._dict_to_list(rowdict))
153 return self.writer.writerows(rows)
154
Raymond Hettinger39a55922003-06-12 03:01:55 +0000155# Guard Sniffer's type checking against builds that exclude complex()
156try:
157 complex
158except NameError:
159 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000160
161class Sniffer:
162 '''
163 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000164 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000165 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000166 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000167 # in case there is more than one possible delimiter
168 self.preferred = [',', '\t', ';', ' ', ':']
169
Skip Montanaro04ae7052003-04-24 20:21:31 +0000170
Skip Montanaro77892372003-05-19 15:33:36 +0000171 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000172 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000173 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000174 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000175
176 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000177 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000178 if delimiter is None:
Skip Montanaro77892372003-05-19 15:33:36 +0000179 delimiter, skipinitialspace = self._guess_delimiter(sample,
180 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000181
Skip Montanaro1448d472003-04-25 14:47:16 +0000182 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000183 _name = "sniffed"
184 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000185 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000186 # escapechar = ''
187 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000188
Skip Montanaro1448d472003-04-25 14:47:16 +0000189 dialect.delimiter = delimiter
190 # _csv.reader won't accept a quotechar of ''
191 dialect.quotechar = quotechar or '"'
192 dialect.skipinitialspace = skipinitialspace
193
194 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000195
196
Skip Montanaro77892372003-05-19 15:33:36 +0000197 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000198 """
199 Looks for text enclosed between two identical quotes
200 (the probable quotechar) which are preceded and followed
201 by the same character (the probable delimiter).
202 For example:
203 ,'some text',
204 The quote with the most wins, same with the delimiter.
205 If there is no quotechar the delimiter can't be determined
206 this way.
207 """
208
209 matches = []
210 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
211 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
212 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
213 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000214 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000215 matches = regexp.findall(data)
216 if matches:
217 break
218
219 if not matches:
220 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
221
222 quotes = {}
223 delims = {}
224 spaces = 0
225 for m in matches:
226 n = regexp.groupindex['quote'] - 1
227 key = m[n]
228 if key:
229 quotes[key] = quotes.get(key, 0) + 1
230 try:
231 n = regexp.groupindex['delim'] - 1
232 key = m[n]
233 except KeyError:
234 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000235 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000236 delims[key] = delims.get(key, 0) + 1
237 try:
238 n = regexp.groupindex['space'] - 1
239 except KeyError:
240 continue
241 if m[n]:
242 spaces += 1
243
244 quotechar = reduce(lambda a, b, quotes = quotes:
245 (quotes[a] > quotes[b]) and a or b, quotes.keys())
246
247 if delims:
248 delim = reduce(lambda a, b, delims = delims:
249 (delims[a] > delims[b]) and a or b, delims.keys())
250 skipinitialspace = delims[delim] == spaces
251 if delim == '\n': # most likely a file with a single column
252 delim = ''
253 else:
254 # there is *no* delimiter, it's a single column of quoted data
255 delim = ''
256 skipinitialspace = 0
257
258 return (quotechar, delim, skipinitialspace)
259
260
Skip Montanaro77892372003-05-19 15:33:36 +0000261 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000262 """
263 The delimiter /should/ occur the same number of times on
264 each row. However, due to malformed data, it may not. We don't want
265 an all or nothing approach, so we allow for small variations in this
266 number.
267 1) build a table of the frequency of each character on every line.
268 2) build a table of freqencies of this frequency (meta-frequency?),
269 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
270 7 times in 2 rows'
271 3) use the mode of the meta-frequency to determine the /expected/
272 frequency for that character
273 4) find out how often the character actually meets that goal
274 5) the character that best meets its goal is the delimiter
275 For performance reasons, the data is evaluated in chunks, so it can
276 try and evaluate the smallest portion of the data possible, evaluating
277 additional chunks as necessary.
278 """
279
280 data = filter(None, data.split('\n'))
281
282 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
283
284 # build frequency tables
285 chunkLength = min(10, len(data))
286 iteration = 0
287 charFrequency = {}
288 modes = {}
289 delims = {}
290 start, end = 0, min(chunkLength, len(data))
291 while start < len(data):
292 iteration += 1
293 for line in data[start:end]:
294 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000295 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000296 # must count even if frequency is 0
297 freq = line.strip().count(char)
298 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000299 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
300 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000301
302 for char in charFrequency.keys():
303 items = charFrequency[char].items()
304 if len(items) == 1 and items[0][0] == 0:
305 continue
306 # get the mode of the frequencies
307 if len(items) > 1:
308 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
309 items)
310 # adjust the mode - subtract the sum of all
311 # other frequencies
312 items.remove(modes[char])
313 modes[char] = (modes[char][0], modes[char][1]
314 - reduce(lambda a, b: (0, a[1] + b[1]),
315 items)[1])
316 else:
317 modes[char] = items[0]
318
319 # build a list of possible delimiters
320 modeList = modes.items()
321 total = float(chunkLength * iteration)
322 # (rows of consistent data) / (number of rows) = 100%
323 consistency = 1.0
324 # minimum consistency threshold
325 threshold = 0.9
326 while len(delims) == 0 and consistency >= threshold:
327 for k, v in modeList:
328 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000329 if ((v[1]/total) >= consistency and
330 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000331 delims[k] = v
332 consistency -= 0.01
333
334 if len(delims) == 1:
335 delim = delims.keys()[0]
336 skipinitialspace = (data[0].count(delim) ==
337 data[0].count("%c " % delim))
338 return (delim, skipinitialspace)
339
340 # analyze another chunkLength lines
341 start = end
342 end += chunkLength
343
344 if not delims:
345 return ('', 0)
346
347 # if there's more than one, fall back to a 'preferred' list
348 if len(delims) > 1:
349 for d in self.preferred:
350 if d in delims.keys():
351 skipinitialspace = (data[0].count(d) ==
352 data[0].count("%c " % d))
353 return (d, skipinitialspace)
354
355 # finally, just return the first damn character in the list
356 delim = delims.keys()[0]
357 skipinitialspace = (data[0].count(delim) ==
358 data[0].count("%c " % delim))
359 return (delim, skipinitialspace)
360
361
Skip Montanaro1448d472003-04-25 14:47:16 +0000362 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000363 # Creates a dictionary of types of data in each column. If any
364 # column is of a single type (say, integers), *except* for the first
365 # row, then the first row is presumed to be labels. If the type
366 # can't be determined, it is assumed to be a string in which case
367 # the length of the string is the determining factor: if all of the
368 # rows except for the first are the same length, it's a header.
369 # Finally, a 'vote' is taken at the end for each column, adding or
370 # subtracting from the likelihood of the first row being a header.
371
Skip Montanaro1448d472003-04-25 14:47:16 +0000372 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000373
Skip Montanaro1448d472003-04-25 14:47:16 +0000374 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000375
376 columns = len(header)
377 columnTypes = {}
378 for i in range(columns): columnTypes[i] = None
379
380 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000381 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000382 # arbitrary number of rows to check, to keep it sane
383 if checked > 20:
384 break
385 checked += 1
386
387 if len(row) != columns:
388 continue # skip rows that have irregular number of columns
389
390 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000391
392 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000393 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000394 thisType(row[col])
395 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000396 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000397 pass
398 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000399 # fallback to length of string
400 thisType = len(row[col])
401
Raymond Hettinger39a55922003-06-12 03:01:55 +0000402 # treat longs as ints
403 if thisType == long:
404 thisType = int
405
Skip Montanaro04ae7052003-04-24 20:21:31 +0000406 if thisType != columnTypes[col]:
407 if columnTypes[col] is None: # add new column type
408 columnTypes[col] = thisType
409 else:
410 # type is inconsistent, remove column from
411 # consideration
412 del columnTypes[col]
413
414 # finally, compare results against first row and "vote"
415 # on whether it's a header
416 hasHeader = 0
417 for col, colType in columnTypes.items():
418 if type(colType) == type(0): # it's a length
419 if len(header[col]) != colType:
420 hasHeader += 1
421 else:
422 hasHeader -= 1
423 else: # attempt typecast
424 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000425 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000426 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000427 hasHeader += 1
428 else:
429 hasHeader -= 1
430
431 return hasHeader > 0