blob: 7e297b69e7a43073f7b102a6069ac9d54fbe1fde [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
10 __doc__
11
Skip Montanaro1448d472003-04-25 14:47:16 +000012try:
13 from cStringIO import StringIO
14except ImportError:
15 from StringIO import StringIO
16
Skip Montanaro04ae7052003-04-24 20:21:31 +000017__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
18 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
19 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
21
22class Dialect:
23 _name = ""
24 _valid = False
25 # placeholders
26 delimiter = None
27 quotechar = None
28 escapechar = None
29 doublequote = None
30 skipinitialspace = None
31 lineterminator = None
32 quoting = None
33
34 def __init__(self):
35 if self.__class__ != Dialect:
36 self._valid = True
37 errors = self._validate()
38 if errors != []:
39 raise Error, "Dialect did not validate: %s" % ", ".join(errors)
40
41 def _validate(self):
42 errors = []
43 if not self._valid:
44 errors.append("can't directly instantiate Dialect class")
45
46 if self.delimiter is None:
47 errors.append("delimiter character not set")
48 elif (not isinstance(self.delimiter, str) or
49 len(self.delimiter) > 1):
50 errors.append("delimiter must be one-character string")
51
52 if self.quotechar is None:
53 if self.quoting != QUOTE_NONE:
54 errors.append("quotechar not set")
55 elif (not isinstance(self.quotechar, str) or
56 len(self.quotechar) > 1):
57 errors.append("quotechar must be one-character string")
58
59 if self.lineterminator is None:
60 errors.append("lineterminator not set")
61 elif not isinstance(self.lineterminator, str):
62 errors.append("lineterminator must be a string")
63
64 if self.doublequote not in (True, False):
65 errors.append("doublequote parameter must be True or False")
66
67 if self.skipinitialspace not in (True, False):
68 errors.append("skipinitialspace parameter must be True or False")
69
70 if self.quoting is None:
71 errors.append("quoting parameter not set")
72
73 if self.quoting is QUOTE_NONE:
74 if (not isinstance(self.escapechar, (unicode, str)) or
75 len(self.escapechar) > 1):
76 errors.append("escapechar must be a one-character string or unicode object")
77
78 return errors
79
80class excel(Dialect):
81 delimiter = ','
82 quotechar = '"'
83 doublequote = True
84 skipinitialspace = False
85 lineterminator = '\r\n'
86 quoting = QUOTE_MINIMAL
87register_dialect("excel", excel)
88
89class excel_tab(excel):
90 delimiter = '\t'
91register_dialect("excel-tab", excel_tab)
92
93
94class DictReader:
95 def __init__(self, f, fieldnames, restkey=None, restval=None,
96 dialect="excel", *args):
97 self.fieldnames = fieldnames # list of keys for the dict
98 self.restkey = restkey # key to catch long rows
99 self.restval = restval # default value for short rows
100 self.reader = reader(f, dialect, *args)
101
102 def __iter__(self):
103 return self
104
105 def next(self):
106 row = self.reader.next()
107 # unlike the basic reader, we prefer not to return blanks,
108 # because we will typically wind up with a dict full of None
109 # values
110 while row == []:
111 row = self.reader.next()
112 d = dict(zip(self.fieldnames, row))
113 lf = len(self.fieldnames)
114 lr = len(row)
115 if lf < lr:
116 d[self.restkey] = row[lf:]
117 elif lf > lr:
118 for key in self.fieldnames[lr:]:
119 d[key] = self.restval
120 return d
121
122
123class DictWriter:
124 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
125 dialect="excel", *args):
126 self.fieldnames = fieldnames # list of keys for the dict
127 self.restval = restval # for writing short dicts
128 if extrasaction.lower() not in ("raise", "ignore"):
129 raise ValueError, \
130 ("extrasaction (%s) must be 'raise' or 'ignore'" %
131 extrasaction)
132 self.extrasaction = extrasaction
133 self.writer = writer(f, dialect, *args)
134
135 def _dict_to_list(self, rowdict):
136 if self.extrasaction == "raise":
137 for k in rowdict.keys():
138 if k not in self.fieldnames:
139 raise ValueError, "dict contains fields not in fieldnames"
140 return [rowdict.get(key, self.restval) for key in self.fieldnames]
141
142 def writerow(self, rowdict):
143 return self.writer.writerow(self._dict_to_list(rowdict))
144
145 def writerows(self, rowdicts):
146 rows = []
147 for rowdict in rowdicts:
148 rows.append(self._dict_to_list(rowdict))
149 return self.writer.writerows(rows)
150
151
152class Sniffer:
153 '''
154 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000155 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000156 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000157 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000158 # in case there is more than one possible delimiter
159 self.preferred = [',', '\t', ';', ' ', ':']
160
Skip Montanaro04ae7052003-04-24 20:21:31 +0000161
Skip Montanaro1448d472003-04-25 14:47:16 +0000162 def sniff(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000164 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000165 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166
167 quotechar, delimiter, skipinitialspace = \
Skip Montanaro1448d472003-04-25 14:47:16 +0000168 self._guess_quote_and_delimiter(sample)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000169 if delimiter is None:
Skip Montanaro1448d472003-04-25 14:47:16 +0000170 delimiter, skipinitialspace = self._guess_delimiter(sample)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000171
Skip Montanaro1448d472003-04-25 14:47:16 +0000172 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000173 _name = "sniffed"
174 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000175 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000176 # escapechar = ''
177 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000178
Skip Montanaro1448d472003-04-25 14:47:16 +0000179 dialect.delimiter = delimiter
180 # _csv.reader won't accept a quotechar of ''
181 dialect.quotechar = quotechar or '"'
182 dialect.skipinitialspace = skipinitialspace
183
184 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000185
186
Skip Montanaro1448d472003-04-25 14:47:16 +0000187 def _guess_quote_and_delimiter(self, data):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000188 """
189 Looks for text enclosed between two identical quotes
190 (the probable quotechar) which are preceded and followed
191 by the same character (the probable delimiter).
192 For example:
193 ,'some text',
194 The quote with the most wins, same with the delimiter.
195 If there is no quotechar the delimiter can't be determined
196 this way.
197 """
198
199 matches = []
200 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
201 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
202 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
203 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
204 regexp = re.compile(restr, re.S | re.M)
205 matches = regexp.findall(data)
206 if matches:
207 break
208
209 if not matches:
210 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
211
212 quotes = {}
213 delims = {}
214 spaces = 0
215 for m in matches:
216 n = regexp.groupindex['quote'] - 1
217 key = m[n]
218 if key:
219 quotes[key] = quotes.get(key, 0) + 1
220 try:
221 n = regexp.groupindex['delim'] - 1
222 key = m[n]
223 except KeyError:
224 continue
225 if key:
226 delims[key] = delims.get(key, 0) + 1
227 try:
228 n = regexp.groupindex['space'] - 1
229 except KeyError:
230 continue
231 if m[n]:
232 spaces += 1
233
234 quotechar = reduce(lambda a, b, quotes = quotes:
235 (quotes[a] > quotes[b]) and a or b, quotes.keys())
236
237 if delims:
238 delim = reduce(lambda a, b, delims = delims:
239 (delims[a] > delims[b]) and a or b, delims.keys())
240 skipinitialspace = delims[delim] == spaces
241 if delim == '\n': # most likely a file with a single column
242 delim = ''
243 else:
244 # there is *no* delimiter, it's a single column of quoted data
245 delim = ''
246 skipinitialspace = 0
247
248 return (quotechar, delim, skipinitialspace)
249
250
Skip Montanaro1448d472003-04-25 14:47:16 +0000251 def _guess_delimiter(self, data):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000252 """
253 The delimiter /should/ occur the same number of times on
254 each row. However, due to malformed data, it may not. We don't want
255 an all or nothing approach, so we allow for small variations in this
256 number.
257 1) build a table of the frequency of each character on every line.
258 2) build a table of freqencies of this frequency (meta-frequency?),
259 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
260 7 times in 2 rows'
261 3) use the mode of the meta-frequency to determine the /expected/
262 frequency for that character
263 4) find out how often the character actually meets that goal
264 5) the character that best meets its goal is the delimiter
265 For performance reasons, the data is evaluated in chunks, so it can
266 try and evaluate the smallest portion of the data possible, evaluating
267 additional chunks as necessary.
268 """
269
270 data = filter(None, data.split('\n'))
271
272 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
273
274 # build frequency tables
275 chunkLength = min(10, len(data))
276 iteration = 0
277 charFrequency = {}
278 modes = {}
279 delims = {}
280 start, end = 0, min(chunkLength, len(data))
281 while start < len(data):
282 iteration += 1
283 for line in data[start:end]:
284 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000285 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000286 # must count even if frequency is 0
287 freq = line.strip().count(char)
288 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000289 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
290 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000291
292 for char in charFrequency.keys():
293 items = charFrequency[char].items()
294 if len(items) == 1 and items[0][0] == 0:
295 continue
296 # get the mode of the frequencies
297 if len(items) > 1:
298 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
299 items)
300 # adjust the mode - subtract the sum of all
301 # other frequencies
302 items.remove(modes[char])
303 modes[char] = (modes[char][0], modes[char][1]
304 - reduce(lambda a, b: (0, a[1] + b[1]),
305 items)[1])
306 else:
307 modes[char] = items[0]
308
309 # build a list of possible delimiters
310 modeList = modes.items()
311 total = float(chunkLength * iteration)
312 # (rows of consistent data) / (number of rows) = 100%
313 consistency = 1.0
314 # minimum consistency threshold
315 threshold = 0.9
316 while len(delims) == 0 and consistency >= threshold:
317 for k, v in modeList:
318 if v[0] > 0 and v[1] > 0:
319 if (v[1]/total) >= consistency:
320 delims[k] = v
321 consistency -= 0.01
322
323 if len(delims) == 1:
324 delim = delims.keys()[0]
325 skipinitialspace = (data[0].count(delim) ==
326 data[0].count("%c " % delim))
327 return (delim, skipinitialspace)
328
329 # analyze another chunkLength lines
330 start = end
331 end += chunkLength
332
333 if not delims:
334 return ('', 0)
335
336 # if there's more than one, fall back to a 'preferred' list
337 if len(delims) > 1:
338 for d in self.preferred:
339 if d in delims.keys():
340 skipinitialspace = (data[0].count(d) ==
341 data[0].count("%c " % d))
342 return (d, skipinitialspace)
343
344 # finally, just return the first damn character in the list
345 delim = delims.keys()[0]
346 skipinitialspace = (data[0].count(delim) ==
347 data[0].count("%c " % delim))
348 return (delim, skipinitialspace)
349
350
Skip Montanaro1448d472003-04-25 14:47:16 +0000351 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000352 # Creates a dictionary of types of data in each column. If any
353 # column is of a single type (say, integers), *except* for the first
354 # row, then the first row is presumed to be labels. If the type
355 # can't be determined, it is assumed to be a string in which case
356 # the length of the string is the determining factor: if all of the
357 # rows except for the first are the same length, it's a header.
358 # Finally, a 'vote' is taken at the end for each column, adding or
359 # subtracting from the likelihood of the first row being a header.
360
361 def seval(item):
362 """
363 Strips parens from item prior to calling eval in an
364 attempt to make it safer
365 """
366 return eval(item.replace('(', '').replace(')', ''))
367
Skip Montanaro1448d472003-04-25 14:47:16 +0000368 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000369
Skip Montanaro1448d472003-04-25 14:47:16 +0000370 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000371
372 columns = len(header)
373 columnTypes = {}
374 for i in range(columns): columnTypes[i] = None
375
376 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000377 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000378 # arbitrary number of rows to check, to keep it sane
379 if checked > 20:
380 break
381 checked += 1
382
383 if len(row) != columns:
384 continue # skip rows that have irregular number of columns
385
386 for col in columnTypes.keys():
387 try:
388 try:
389 # is it a built-in type (besides string)?
390 thisType = type(seval(row[col]))
391 except OverflowError:
392 # a long int?
393 thisType = type(seval(row[col] + 'L'))
394 thisType = type(0) # treat long ints as int
395 except:
396 # fallback to length of string
397 thisType = len(row[col])
398
399 if thisType != columnTypes[col]:
400 if columnTypes[col] is None: # add new column type
401 columnTypes[col] = thisType
402 else:
403 # type is inconsistent, remove column from
404 # consideration
405 del columnTypes[col]
406
407 # finally, compare results against first row and "vote"
408 # on whether it's a header
409 hasHeader = 0
410 for col, colType in columnTypes.items():
411 if type(colType) == type(0): # it's a length
412 if len(header[col]) != colType:
413 hasHeader += 1
414 else:
415 hasHeader -= 1
416 else: # attempt typecast
417 try:
418 eval("%s(%s)" % (colType.__name__, header[col]))
419 except:
420 hasHeader += 1
421 else:
422 hasHeader -= 1
423
424 return hasHeader > 0