blob: f213854783eb67f1483e3abeeb94c70afefa8193 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Skip Montanaro1448d472003-04-25 14:47:16 +000014try:
15 from cStringIO import StringIO
16except ImportError:
17 from StringIO import StringIO
18
Skip Montanaro04ae7052003-04-24 20:21:31 +000019__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
23
24class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000025 """Describe an Excel dialect.
26
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
30
31 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000032 _name = ""
33 _valid = False
34 # placeholders
35 delimiter = None
36 quotechar = None
37 escapechar = None
38 doublequote = None
39 skipinitialspace = None
40 lineterminator = None
41 quoting = None
42
43 def __init__(self):
44 if self.__class__ != Dialect:
45 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000047
48 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 try:
50 _Dialect(self)
51 except TypeError, e:
52 # We do this for compatibility with py2.3
53 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000054
55class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000056 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000057 delimiter = ','
58 quotechar = '"'
59 doublequote = True
60 skipinitialspace = False
61 lineterminator = '\r\n'
62 quoting = QUOTE_MINIMAL
63register_dialect("excel", excel)
64
65class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000066 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000067 delimiter = '\t'
68register_dialect("excel-tab", excel_tab)
69
70
71class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000072 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000073 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000074 self.fieldnames = fieldnames # list of keys for the dict
75 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000077 self.reader = reader(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +000078
79 def __iter__(self):
80 return self
81
82 def next(self):
83 row = self.reader.next()
Skip Montanarodffeed32003-10-03 14:03:01 +000084 if self.fieldnames is None:
85 self.fieldnames = row
86 row = self.reader.next()
87
Skip Montanaro04ae7052003-04-24 20:21:31 +000088 # unlike the basic reader, we prefer not to return blanks,
89 # because we will typically wind up with a dict full of None
90 # values
91 while row == []:
92 row = self.reader.next()
93 d = dict(zip(self.fieldnames, row))
94 lf = len(self.fieldnames)
95 lr = len(row)
96 if lf < lr:
97 d[self.restkey] = row[lf:]
98 elif lf > lr:
99 for key in self.fieldnames[lr:]:
100 d[key] = self.restval
101 return d
102
103
104class DictWriter:
105 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000106 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000107 self.fieldnames = fieldnames # list of keys for the dict
108 self.restval = restval # for writing short dicts
109 if extrasaction.lower() not in ("raise", "ignore"):
110 raise ValueError, \
111 ("extrasaction (%s) must be 'raise' or 'ignore'" %
112 extrasaction)
113 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000114 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000115
116 def _dict_to_list(self, rowdict):
117 if self.extrasaction == "raise":
118 for k in rowdict.keys():
119 if k not in self.fieldnames:
120 raise ValueError, "dict contains fields not in fieldnames"
121 return [rowdict.get(key, self.restval) for key in self.fieldnames]
122
123 def writerow(self, rowdict):
124 return self.writer.writerow(self._dict_to_list(rowdict))
125
126 def writerows(self, rowdicts):
127 rows = []
128 for rowdict in rowdicts:
129 rows.append(self._dict_to_list(rowdict))
130 return self.writer.writerows(rows)
131
Raymond Hettinger39a55922003-06-12 03:01:55 +0000132# Guard Sniffer's type checking against builds that exclude complex()
133try:
134 complex
135except NameError:
136 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000137
138class Sniffer:
139 '''
140 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000141 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000142 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000143 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000144 # in case there is more than one possible delimiter
145 self.preferred = [',', '\t', ';', ' ', ':']
146
Skip Montanaro04ae7052003-04-24 20:21:31 +0000147
Skip Montanaro77892372003-05-19 15:33:36 +0000148 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000149 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000150 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000151 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000152
153 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000154 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000155 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000156 delimiter, skipinitialspace = self._guess_delimiter(sample,
157 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000158
Skip Montanaro39b29be2005-12-30 05:09:48 +0000159 if not delimiter:
160 raise Error, "Could not determine delimiter"
161
Skip Montanaro1448d472003-04-25 14:47:16 +0000162 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 _name = "sniffed"
164 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000165 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166 # escapechar = ''
167 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000168
Skip Montanaro1448d472003-04-25 14:47:16 +0000169 dialect.delimiter = delimiter
170 # _csv.reader won't accept a quotechar of ''
171 dialect.quotechar = quotechar or '"'
172 dialect.skipinitialspace = skipinitialspace
173
174 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000175
176
Skip Montanaro77892372003-05-19 15:33:36 +0000177 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000178 """
179 Looks for text enclosed between two identical quotes
180 (the probable quotechar) which are preceded and followed
181 by the same character (the probable delimiter).
182 For example:
183 ,'some text',
184 The quote with the most wins, same with the delimiter.
185 If there is no quotechar the delimiter can't be determined
186 this way.
187 """
188
189 matches = []
190 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
191 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
192 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
193 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000194 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000195 matches = regexp.findall(data)
196 if matches:
197 break
198
199 if not matches:
200 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
201
202 quotes = {}
203 delims = {}
204 spaces = 0
205 for m in matches:
206 n = regexp.groupindex['quote'] - 1
207 key = m[n]
208 if key:
209 quotes[key] = quotes.get(key, 0) + 1
210 try:
211 n = regexp.groupindex['delim'] - 1
212 key = m[n]
213 except KeyError:
214 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000215 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000216 delims[key] = delims.get(key, 0) + 1
217 try:
218 n = regexp.groupindex['space'] - 1
219 except KeyError:
220 continue
221 if m[n]:
222 spaces += 1
223
224 quotechar = reduce(lambda a, b, quotes = quotes:
225 (quotes[a] > quotes[b]) and a or b, quotes.keys())
226
227 if delims:
228 delim = reduce(lambda a, b, delims = delims:
229 (delims[a] > delims[b]) and a or b, delims.keys())
230 skipinitialspace = delims[delim] == spaces
231 if delim == '\n': # most likely a file with a single column
232 delim = ''
233 else:
234 # there is *no* delimiter, it's a single column of quoted data
235 delim = ''
236 skipinitialspace = 0
237
238 return (quotechar, delim, skipinitialspace)
239
240
Skip Montanaro77892372003-05-19 15:33:36 +0000241 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000242 """
243 The delimiter /should/ occur the same number of times on
244 each row. However, due to malformed data, it may not. We don't want
245 an all or nothing approach, so we allow for small variations in this
246 number.
247 1) build a table of the frequency of each character on every line.
248 2) build a table of freqencies of this frequency (meta-frequency?),
249 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
250 7 times in 2 rows'
251 3) use the mode of the meta-frequency to determine the /expected/
252 frequency for that character
253 4) find out how often the character actually meets that goal
254 5) the character that best meets its goal is the delimiter
255 For performance reasons, the data is evaluated in chunks, so it can
256 try and evaluate the smallest portion of the data possible, evaluating
257 additional chunks as necessary.
258 """
259
260 data = filter(None, data.split('\n'))
261
262 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
263
264 # build frequency tables
265 chunkLength = min(10, len(data))
266 iteration = 0
267 charFrequency = {}
268 modes = {}
269 delims = {}
270 start, end = 0, min(chunkLength, len(data))
271 while start < len(data):
272 iteration += 1
273 for line in data[start:end]:
274 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000275 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000276 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000277 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000278 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000279 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
280 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000281
282 for char in charFrequency.keys():
283 items = charFrequency[char].items()
284 if len(items) == 1 and items[0][0] == 0:
285 continue
286 # get the mode of the frequencies
287 if len(items) > 1:
288 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
289 items)
290 # adjust the mode - subtract the sum of all
291 # other frequencies
292 items.remove(modes[char])
293 modes[char] = (modes[char][0], modes[char][1]
294 - reduce(lambda a, b: (0, a[1] + b[1]),
295 items)[1])
296 else:
297 modes[char] = items[0]
298
299 # build a list of possible delimiters
300 modeList = modes.items()
301 total = float(chunkLength * iteration)
302 # (rows of consistent data) / (number of rows) = 100%
303 consistency = 1.0
304 # minimum consistency threshold
305 threshold = 0.9
306 while len(delims) == 0 and consistency >= threshold:
307 for k, v in modeList:
308 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000309 if ((v[1]/total) >= consistency and
310 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000311 delims[k] = v
312 consistency -= 0.01
313
314 if len(delims) == 1:
315 delim = delims.keys()[0]
316 skipinitialspace = (data[0].count(delim) ==
317 data[0].count("%c " % delim))
318 return (delim, skipinitialspace)
319
320 # analyze another chunkLength lines
321 start = end
322 end += chunkLength
323
324 if not delims:
325 return ('', 0)
326
327 # if there's more than one, fall back to a 'preferred' list
328 if len(delims) > 1:
329 for d in self.preferred:
330 if d in delims.keys():
331 skipinitialspace = (data[0].count(d) ==
332 data[0].count("%c " % d))
333 return (d, skipinitialspace)
334
Skip Montanaro39b29be2005-12-30 05:09:48 +0000335 # nothing else indicates a preference, pick the character that
336 # dominates(?)
337 items = [(v,k) for (k,v) in delims.items()]
338 items.sort()
339 delim = items[-1][1]
340
Skip Montanaro04ae7052003-04-24 20:21:31 +0000341 skipinitialspace = (data[0].count(delim) ==
342 data[0].count("%c " % delim))
343 return (delim, skipinitialspace)
344
345
Skip Montanaro1448d472003-04-25 14:47:16 +0000346 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000347 # Creates a dictionary of types of data in each column. If any
348 # column is of a single type (say, integers), *except* for the first
349 # row, then the first row is presumed to be labels. If the type
350 # can't be determined, it is assumed to be a string in which case
351 # the length of the string is the determining factor: if all of the
352 # rows except for the first are the same length, it's a header.
353 # Finally, a 'vote' is taken at the end for each column, adding or
354 # subtracting from the likelihood of the first row being a header.
355
Skip Montanaro1448d472003-04-25 14:47:16 +0000356 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000357
Skip Montanaro1448d472003-04-25 14:47:16 +0000358 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000359
360 columns = len(header)
361 columnTypes = {}
362 for i in range(columns): columnTypes[i] = None
363
364 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000365 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000366 # arbitrary number of rows to check, to keep it sane
367 if checked > 20:
368 break
369 checked += 1
370
371 if len(row) != columns:
372 continue # skip rows that have irregular number of columns
373
374 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000375
376 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000377 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000378 thisType(row[col])
379 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000380 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000381 pass
382 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000383 # fallback to length of string
384 thisType = len(row[col])
385
Raymond Hettinger39a55922003-06-12 03:01:55 +0000386 # treat longs as ints
387 if thisType == long:
388 thisType = int
389
Skip Montanaro04ae7052003-04-24 20:21:31 +0000390 if thisType != columnTypes[col]:
391 if columnTypes[col] is None: # add new column type
392 columnTypes[col] = thisType
393 else:
394 # type is inconsistent, remove column from
395 # consideration
396 del columnTypes[col]
397
398 # finally, compare results against first row and "vote"
399 # on whether it's a header
400 hasHeader = 0
401 for col, colType in columnTypes.items():
402 if type(colType) == type(0): # it's a length
403 if len(header[col]) != colType:
404 hasHeader += 1
405 else:
406 hasHeader -= 1
407 else: # attempt typecast
408 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000409 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000410 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000411 hasHeader += 1
412 else:
413 hasHeader -= 1
414
415 return hasHeader > 0