blob: c8e84813965c17318adc283b8cfa8876394562b8 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Skip Montanaro1448d472003-04-25 14:47:16 +000014try:
15 from cStringIO import StringIO
16except ImportError:
17 from StringIO import StringIO
18
Skip Montanaro04ae7052003-04-24 20:21:31 +000019__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
23
24class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000025 """Describe an Excel dialect.
26
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
30
31 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000032 _name = ""
33 _valid = False
34 # placeholders
35 delimiter = None
36 quotechar = None
37 escapechar = None
38 doublequote = None
39 skipinitialspace = None
40 lineterminator = None
41 quoting = None
42
43 def __init__(self):
44 if self.__class__ != Dialect:
45 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000047
48 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 try:
50 _Dialect(self)
51 except TypeError, e:
52 # We do this for compatibility with py2.3
53 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000054
55class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000056 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000057 delimiter = ','
58 quotechar = '"'
59 doublequote = True
60 skipinitialspace = False
61 lineterminator = '\r\n'
62 quoting = QUOTE_MINIMAL
63register_dialect("excel", excel)
64
65class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000066 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000067 delimiter = '\t'
68register_dialect("excel-tab", excel_tab)
69
70
71class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000072 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000073 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000074 self.fieldnames = fieldnames # list of keys for the dict
75 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000077 self.reader = reader(f, dialect, *args, **kwds)
Georg Brandld9b9d682008-03-21 20:01:55 +000078 self.dialect = dialect
79 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000080
81 def __iter__(self):
82 return self
83
84 def next(self):
85 row = self.reader.next()
Skip Montanarodffeed32003-10-03 14:03:01 +000086 if self.fieldnames is None:
87 self.fieldnames = row
88 row = self.reader.next()
Georg Brandld9b9d682008-03-21 20:01:55 +000089 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +000090
Skip Montanaro04ae7052003-04-24 20:21:31 +000091 # unlike the basic reader, we prefer not to return blanks,
92 # because we will typically wind up with a dict full of None
93 # values
94 while row == []:
95 row = self.reader.next()
96 d = dict(zip(self.fieldnames, row))
97 lf = len(self.fieldnames)
98 lr = len(row)
99 if lf < lr:
100 d[self.restkey] = row[lf:]
101 elif lf > lr:
102 for key in self.fieldnames[lr:]:
103 d[key] = self.restval
104 return d
105
106
107class DictWriter:
108 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000109 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000110 self.fieldnames = fieldnames # list of keys for the dict
111 self.restval = restval # for writing short dicts
112 if extrasaction.lower() not in ("raise", "ignore"):
113 raise ValueError, \
114 ("extrasaction (%s) must be 'raise' or 'ignore'" %
115 extrasaction)
116 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000117 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000118
119 def _dict_to_list(self, rowdict):
120 if self.extrasaction == "raise":
121 for k in rowdict.keys():
122 if k not in self.fieldnames:
123 raise ValueError, "dict contains fields not in fieldnames"
124 return [rowdict.get(key, self.restval) for key in self.fieldnames]
125
126 def writerow(self, rowdict):
127 return self.writer.writerow(self._dict_to_list(rowdict))
128
129 def writerows(self, rowdicts):
130 rows = []
131 for rowdict in rowdicts:
132 rows.append(self._dict_to_list(rowdict))
133 return self.writer.writerows(rows)
134
Raymond Hettinger39a55922003-06-12 03:01:55 +0000135# Guard Sniffer's type checking against builds that exclude complex()
136try:
137 complex
138except NameError:
139 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000140
141class Sniffer:
142 '''
143 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000144 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000145 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000146 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000147 # in case there is more than one possible delimiter
148 self.preferred = [',', '\t', ';', ' ', ':']
149
Skip Montanaro04ae7052003-04-24 20:21:31 +0000150
Skip Montanaro77892372003-05-19 15:33:36 +0000151 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000152 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000153 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000154 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000155
156 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000157 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000158 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000159 delimiter, skipinitialspace = self._guess_delimiter(sample,
160 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000161
Skip Montanaro39b29be2005-12-30 05:09:48 +0000162 if not delimiter:
163 raise Error, "Could not determine delimiter"
164
Skip Montanaro1448d472003-04-25 14:47:16 +0000165 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166 _name = "sniffed"
167 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000168 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000169 # escapechar = ''
170 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000171
Skip Montanaro1448d472003-04-25 14:47:16 +0000172 dialect.delimiter = delimiter
173 # _csv.reader won't accept a quotechar of ''
174 dialect.quotechar = quotechar or '"'
175 dialect.skipinitialspace = skipinitialspace
176
177 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000178
179
Skip Montanaro77892372003-05-19 15:33:36 +0000180 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000181 """
182 Looks for text enclosed between two identical quotes
183 (the probable quotechar) which are preceded and followed
184 by the same character (the probable delimiter).
185 For example:
186 ,'some text',
187 The quote with the most wins, same with the delimiter.
188 If there is no quotechar the delimiter can't be determined
189 this way.
190 """
191
192 matches = []
193 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
194 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
195 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
196 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000197 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000198 matches = regexp.findall(data)
199 if matches:
200 break
201
202 if not matches:
203 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
204
205 quotes = {}
206 delims = {}
207 spaces = 0
208 for m in matches:
209 n = regexp.groupindex['quote'] - 1
210 key = m[n]
211 if key:
212 quotes[key] = quotes.get(key, 0) + 1
213 try:
214 n = regexp.groupindex['delim'] - 1
215 key = m[n]
216 except KeyError:
217 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000218 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000219 delims[key] = delims.get(key, 0) + 1
220 try:
221 n = regexp.groupindex['space'] - 1
222 except KeyError:
223 continue
224 if m[n]:
225 spaces += 1
226
227 quotechar = reduce(lambda a, b, quotes = quotes:
228 (quotes[a] > quotes[b]) and a or b, quotes.keys())
229
230 if delims:
231 delim = reduce(lambda a, b, delims = delims:
232 (delims[a] > delims[b]) and a or b, delims.keys())
233 skipinitialspace = delims[delim] == spaces
234 if delim == '\n': # most likely a file with a single column
235 delim = ''
236 else:
237 # there is *no* delimiter, it's a single column of quoted data
238 delim = ''
239 skipinitialspace = 0
240
241 return (quotechar, delim, skipinitialspace)
242
243
Skip Montanaro77892372003-05-19 15:33:36 +0000244 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000245 """
246 The delimiter /should/ occur the same number of times on
247 each row. However, due to malformed data, it may not. We don't want
248 an all or nothing approach, so we allow for small variations in this
249 number.
250 1) build a table of the frequency of each character on every line.
251 2) build a table of freqencies of this frequency (meta-frequency?),
252 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
253 7 times in 2 rows'
254 3) use the mode of the meta-frequency to determine the /expected/
255 frequency for that character
256 4) find out how often the character actually meets that goal
257 5) the character that best meets its goal is the delimiter
258 For performance reasons, the data is evaluated in chunks, so it can
259 try and evaluate the smallest portion of the data possible, evaluating
260 additional chunks as necessary.
261 """
262
263 data = filter(None, data.split('\n'))
264
265 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
266
267 # build frequency tables
268 chunkLength = min(10, len(data))
269 iteration = 0
270 charFrequency = {}
271 modes = {}
272 delims = {}
273 start, end = 0, min(chunkLength, len(data))
274 while start < len(data):
275 iteration += 1
276 for line in data[start:end]:
277 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000278 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000279 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000280 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000281 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000282 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
283 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000284
285 for char in charFrequency.keys():
286 items = charFrequency[char].items()
287 if len(items) == 1 and items[0][0] == 0:
288 continue
289 # get the mode of the frequencies
290 if len(items) > 1:
291 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
292 items)
293 # adjust the mode - subtract the sum of all
294 # other frequencies
295 items.remove(modes[char])
296 modes[char] = (modes[char][0], modes[char][1]
297 - reduce(lambda a, b: (0, a[1] + b[1]),
298 items)[1])
299 else:
300 modes[char] = items[0]
301
302 # build a list of possible delimiters
303 modeList = modes.items()
304 total = float(chunkLength * iteration)
305 # (rows of consistent data) / (number of rows) = 100%
306 consistency = 1.0
307 # minimum consistency threshold
308 threshold = 0.9
309 while len(delims) == 0 and consistency >= threshold:
310 for k, v in modeList:
311 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000312 if ((v[1]/total) >= consistency and
313 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000314 delims[k] = v
315 consistency -= 0.01
316
317 if len(delims) == 1:
318 delim = delims.keys()[0]
319 skipinitialspace = (data[0].count(delim) ==
320 data[0].count("%c " % delim))
321 return (delim, skipinitialspace)
322
323 # analyze another chunkLength lines
324 start = end
325 end += chunkLength
326
327 if not delims:
328 return ('', 0)
329
330 # if there's more than one, fall back to a 'preferred' list
331 if len(delims) > 1:
332 for d in self.preferred:
333 if d in delims.keys():
334 skipinitialspace = (data[0].count(d) ==
335 data[0].count("%c " % d))
336 return (d, skipinitialspace)
337
Skip Montanaro39b29be2005-12-30 05:09:48 +0000338 # nothing else indicates a preference, pick the character that
339 # dominates(?)
340 items = [(v,k) for (k,v) in delims.items()]
341 items.sort()
342 delim = items[-1][1]
343
Skip Montanaro04ae7052003-04-24 20:21:31 +0000344 skipinitialspace = (data[0].count(delim) ==
345 data[0].count("%c " % delim))
346 return (delim, skipinitialspace)
347
348
Skip Montanaro1448d472003-04-25 14:47:16 +0000349 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000350 # Creates a dictionary of types of data in each column. If any
351 # column is of a single type (say, integers), *except* for the first
352 # row, then the first row is presumed to be labels. If the type
353 # can't be determined, it is assumed to be a string in which case
354 # the length of the string is the determining factor: if all of the
355 # rows except for the first are the same length, it's a header.
356 # Finally, a 'vote' is taken at the end for each column, adding or
357 # subtracting from the likelihood of the first row being a header.
358
Skip Montanaro1448d472003-04-25 14:47:16 +0000359 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000360
Skip Montanaro1448d472003-04-25 14:47:16 +0000361 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000362
363 columns = len(header)
364 columnTypes = {}
365 for i in range(columns): columnTypes[i] = None
366
367 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000368 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000369 # arbitrary number of rows to check, to keep it sane
370 if checked > 20:
371 break
372 checked += 1
373
374 if len(row) != columns:
375 continue # skip rows that have irregular number of columns
376
377 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000378
379 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000380 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000381 thisType(row[col])
382 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000383 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000384 pass
385 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000386 # fallback to length of string
387 thisType = len(row[col])
388
Raymond Hettinger39a55922003-06-12 03:01:55 +0000389 # treat longs as ints
390 if thisType == long:
391 thisType = int
392
Skip Montanaro04ae7052003-04-24 20:21:31 +0000393 if thisType != columnTypes[col]:
394 if columnTypes[col] is None: # add new column type
395 columnTypes[col] = thisType
396 else:
397 # type is inconsistent, remove column from
398 # consideration
399 del columnTypes[col]
400
401 # finally, compare results against first row and "vote"
402 # on whether it's a header
403 hasHeader = 0
404 for col, colType in columnTypes.items():
405 if type(colType) == type(0): # it's a length
406 if len(header[col]) != colType:
407 hasHeader += 1
408 else:
409 hasHeader -= 1
410 else: # attempt typecast
411 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000412 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000413 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000414 hasHeader += 1
415 else:
416 hasHeader -= 1
417
418 return hasHeader > 0