blob: 41026e03bdb2532a3744ee7a8e359276dab414fc [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Skip Montanaro1448d472003-04-25 14:47:16 +000014try:
15 from cStringIO import StringIO
16except ImportError:
17 from StringIO import StringIO
18
Skip Montanaro04ae7052003-04-24 20:21:31 +000019__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
23
24class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000025 """Describe an Excel dialect.
26
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
30
31 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000032 _name = ""
33 _valid = False
34 # placeholders
35 delimiter = None
36 quotechar = None
37 escapechar = None
38 doublequote = None
39 skipinitialspace = None
40 lineterminator = None
41 quoting = None
42
43 def __init__(self):
44 if self.__class__ != Dialect:
45 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000047
48 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 try:
50 _Dialect(self)
51 except TypeError, e:
52 # We do this for compatibility with py2.3
53 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000054
55class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000056 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000057 delimiter = ','
58 quotechar = '"'
59 doublequote = True
60 skipinitialspace = False
61 lineterminator = '\r\n'
62 quoting = QUOTE_MINIMAL
63register_dialect("excel", excel)
64
65class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000066 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000067 delimiter = '\t'
68register_dialect("excel-tab", excel_tab)
69
70
71class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000072 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000073 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000074 self.fieldnames = fieldnames # list of keys for the dict
75 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000077 self.reader = reader(f, dialect, *args, **kwds)
Georg Brandl77354cf2008-03-21 20:01:51 +000078 self.dialect = dialect
79 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000080
81 def __iter__(self):
82 return self
83
84 def next(self):
85 row = self.reader.next()
Skip Montanarodffeed32003-10-03 14:03:01 +000086 if self.fieldnames is None:
87 self.fieldnames = row
88 row = self.reader.next()
Georg Brandl77354cf2008-03-21 20:01:51 +000089 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +000090
Skip Montanaro04ae7052003-04-24 20:21:31 +000091 # unlike the basic reader, we prefer not to return blanks,
92 # because we will typically wind up with a dict full of None
93 # values
94 while row == []:
95 row = self.reader.next()
96 d = dict(zip(self.fieldnames, row))
97 lf = len(self.fieldnames)
98 lr = len(row)
99 if lf < lr:
100 d[self.restkey] = row[lf:]
101 elif lf > lr:
102 for key in self.fieldnames[lr:]:
103 d[key] = self.restval
104 return d
105
106
107class DictWriter:
108 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000109 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000110 self.fieldnames = fieldnames # list of keys for the dict
111 self.restval = restval # for writing short dicts
112 if extrasaction.lower() not in ("raise", "ignore"):
113 raise ValueError, \
114 ("extrasaction (%s) must be 'raise' or 'ignore'" %
115 extrasaction)
116 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000117 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000118
119 def _dict_to_list(self, rowdict):
120 if self.extrasaction == "raise":
Georg Brandl94fe3f52007-03-13 09:32:11 +0000121 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
122 if wrong_fields:
123 raise ValueError("dict contains fields not in fieldnames: " +
124 ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000125 return [rowdict.get(key, self.restval) for key in self.fieldnames]
126
127 def writerow(self, rowdict):
128 return self.writer.writerow(self._dict_to_list(rowdict))
129
130 def writerows(self, rowdicts):
131 rows = []
132 for rowdict in rowdicts:
133 rows.append(self._dict_to_list(rowdict))
134 return self.writer.writerows(rows)
135
Raymond Hettinger39a55922003-06-12 03:01:55 +0000136# Guard Sniffer's type checking against builds that exclude complex()
137try:
138 complex
139except NameError:
140 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000141
142class Sniffer:
143 '''
144 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000145 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000146 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000147 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000148 # in case there is more than one possible delimiter
149 self.preferred = [',', '\t', ';', ' ', ':']
150
Skip Montanaro04ae7052003-04-24 20:21:31 +0000151
Skip Montanaro77892372003-05-19 15:33:36 +0000152 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000153 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000154 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000155 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000156
157 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000158 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000159 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000160 delimiter, skipinitialspace = self._guess_delimiter(sample,
161 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000162
Skip Montanaro39b29be2005-12-30 05:09:48 +0000163 if not delimiter:
164 raise Error, "Could not determine delimiter"
165
Skip Montanaro1448d472003-04-25 14:47:16 +0000166 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000167 _name = "sniffed"
168 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000169 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000170 # escapechar = ''
171 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000172
Skip Montanaro1448d472003-04-25 14:47:16 +0000173 dialect.delimiter = delimiter
174 # _csv.reader won't accept a quotechar of ''
175 dialect.quotechar = quotechar or '"'
176 dialect.skipinitialspace = skipinitialspace
177
178 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000179
180
Skip Montanaro77892372003-05-19 15:33:36 +0000181 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000182 """
183 Looks for text enclosed between two identical quotes
184 (the probable quotechar) which are preceded and followed
185 by the same character (the probable delimiter).
186 For example:
187 ,'some text',
188 The quote with the most wins, same with the delimiter.
189 If there is no quotechar the delimiter can't be determined
190 this way.
191 """
192
193 matches = []
194 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
195 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
196 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
197 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000198 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000199 matches = regexp.findall(data)
200 if matches:
201 break
202
203 if not matches:
204 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
205
206 quotes = {}
207 delims = {}
208 spaces = 0
209 for m in matches:
210 n = regexp.groupindex['quote'] - 1
211 key = m[n]
212 if key:
213 quotes[key] = quotes.get(key, 0) + 1
214 try:
215 n = regexp.groupindex['delim'] - 1
216 key = m[n]
217 except KeyError:
218 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000219 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000220 delims[key] = delims.get(key, 0) + 1
221 try:
222 n = regexp.groupindex['space'] - 1
223 except KeyError:
224 continue
225 if m[n]:
226 spaces += 1
227
228 quotechar = reduce(lambda a, b, quotes = quotes:
229 (quotes[a] > quotes[b]) and a or b, quotes.keys())
230
231 if delims:
232 delim = reduce(lambda a, b, delims = delims:
233 (delims[a] > delims[b]) and a or b, delims.keys())
234 skipinitialspace = delims[delim] == spaces
235 if delim == '\n': # most likely a file with a single column
236 delim = ''
237 else:
238 # there is *no* delimiter, it's a single column of quoted data
239 delim = ''
240 skipinitialspace = 0
241
242 return (quotechar, delim, skipinitialspace)
243
244
Skip Montanaro77892372003-05-19 15:33:36 +0000245 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000246 """
247 The delimiter /should/ occur the same number of times on
248 each row. However, due to malformed data, it may not. We don't want
249 an all or nothing approach, so we allow for small variations in this
250 number.
251 1) build a table of the frequency of each character on every line.
252 2) build a table of freqencies of this frequency (meta-frequency?),
253 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
254 7 times in 2 rows'
255 3) use the mode of the meta-frequency to determine the /expected/
256 frequency for that character
257 4) find out how often the character actually meets that goal
258 5) the character that best meets its goal is the delimiter
259 For performance reasons, the data is evaluated in chunks, so it can
260 try and evaluate the smallest portion of the data possible, evaluating
261 additional chunks as necessary.
262 """
263
264 data = filter(None, data.split('\n'))
265
266 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
267
268 # build frequency tables
269 chunkLength = min(10, len(data))
270 iteration = 0
271 charFrequency = {}
272 modes = {}
273 delims = {}
274 start, end = 0, min(chunkLength, len(data))
275 while start < len(data):
276 iteration += 1
277 for line in data[start:end]:
278 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000279 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000280 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000281 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000282 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000283 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
284 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000285
286 for char in charFrequency.keys():
287 items = charFrequency[char].items()
288 if len(items) == 1 and items[0][0] == 0:
289 continue
290 # get the mode of the frequencies
291 if len(items) > 1:
292 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
293 items)
294 # adjust the mode - subtract the sum of all
295 # other frequencies
296 items.remove(modes[char])
297 modes[char] = (modes[char][0], modes[char][1]
298 - reduce(lambda a, b: (0, a[1] + b[1]),
299 items)[1])
300 else:
301 modes[char] = items[0]
302
303 # build a list of possible delimiters
304 modeList = modes.items()
305 total = float(chunkLength * iteration)
306 # (rows of consistent data) / (number of rows) = 100%
307 consistency = 1.0
308 # minimum consistency threshold
309 threshold = 0.9
310 while len(delims) == 0 and consistency >= threshold:
311 for k, v in modeList:
312 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000313 if ((v[1]/total) >= consistency and
314 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000315 delims[k] = v
316 consistency -= 0.01
317
318 if len(delims) == 1:
319 delim = delims.keys()[0]
320 skipinitialspace = (data[0].count(delim) ==
321 data[0].count("%c " % delim))
322 return (delim, skipinitialspace)
323
324 # analyze another chunkLength lines
325 start = end
326 end += chunkLength
327
328 if not delims:
329 return ('', 0)
330
331 # if there's more than one, fall back to a 'preferred' list
332 if len(delims) > 1:
333 for d in self.preferred:
334 if d in delims.keys():
335 skipinitialspace = (data[0].count(d) ==
336 data[0].count("%c " % d))
337 return (d, skipinitialspace)
338
Skip Montanaro39b29be2005-12-30 05:09:48 +0000339 # nothing else indicates a preference, pick the character that
340 # dominates(?)
341 items = [(v,k) for (k,v) in delims.items()]
342 items.sort()
343 delim = items[-1][1]
344
Skip Montanaro04ae7052003-04-24 20:21:31 +0000345 skipinitialspace = (data[0].count(delim) ==
346 data[0].count("%c " % delim))
347 return (delim, skipinitialspace)
348
349
Skip Montanaro1448d472003-04-25 14:47:16 +0000350 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000351 # Creates a dictionary of types of data in each column. If any
352 # column is of a single type (say, integers), *except* for the first
353 # row, then the first row is presumed to be labels. If the type
354 # can't be determined, it is assumed to be a string in which case
355 # the length of the string is the determining factor: if all of the
356 # rows except for the first are the same length, it's a header.
357 # Finally, a 'vote' is taken at the end for each column, adding or
358 # subtracting from the likelihood of the first row being a header.
359
Skip Montanaro1448d472003-04-25 14:47:16 +0000360 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000361
Skip Montanaro1448d472003-04-25 14:47:16 +0000362 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000363
364 columns = len(header)
365 columnTypes = {}
366 for i in range(columns): columnTypes[i] = None
367
368 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000369 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000370 # arbitrary number of rows to check, to keep it sane
371 if checked > 20:
372 break
373 checked += 1
374
375 if len(row) != columns:
376 continue # skip rows that have irregular number of columns
377
378 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000379
380 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000381 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000382 thisType(row[col])
383 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000384 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000385 pass
386 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000387 # fallback to length of string
388 thisType = len(row[col])
389
Raymond Hettinger39a55922003-06-12 03:01:55 +0000390 # treat longs as ints
391 if thisType == long:
392 thisType = int
393
Skip Montanaro04ae7052003-04-24 20:21:31 +0000394 if thisType != columnTypes[col]:
395 if columnTypes[col] is None: # add new column type
396 columnTypes[col] = thisType
397 else:
398 # type is inconsistent, remove column from
399 # consideration
400 del columnTypes[col]
401
402 # finally, compare results against first row and "vote"
403 # on whether it's a header
404 hasHeader = 0
405 for col, colType in columnTypes.items():
406 if type(colType) == type(0): # it's a length
407 if len(header[col]) != colType:
408 hasHeader += 1
409 else:
410 hasHeader -= 1
411 else: # attempt typecast
412 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000413 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000414 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000415 hasHeader += 1
416 else:
417 hasHeader -= 1
418
419 return hasHeader > 0