blob: 14b4d17c2342b9b11121d41683f1fb8b4c2230ff [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Skip Montanaro1448d472003-04-25 14:47:16 +000014try:
15 from cStringIO import StringIO
16except ImportError:
17 from StringIO import StringIO
18
Skip Montanaro04ae7052003-04-24 20:21:31 +000019__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
23
24class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000025 """Describe an Excel dialect.
26
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
30
31 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000032 _name = ""
33 _valid = False
34 # placeholders
35 delimiter = None
36 quotechar = None
37 escapechar = None
38 doublequote = None
39 skipinitialspace = None
40 lineterminator = None
41 quoting = None
42
43 def __init__(self):
44 if self.__class__ != Dialect:
45 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000047
48 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 try:
50 _Dialect(self)
51 except TypeError, e:
52 # We do this for compatibility with py2.3
53 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000054
55class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000056 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000057 delimiter = ','
58 quotechar = '"'
59 doublequote = True
60 skipinitialspace = False
61 lineterminator = '\r\n'
62 quoting = QUOTE_MINIMAL
63register_dialect("excel", excel)
64
65class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000066 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000067 delimiter = '\t'
68register_dialect("excel-tab", excel_tab)
69
70
71class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000072 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000073 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000074 self.fieldnames = fieldnames # list of keys for the dict
75 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000077 self.reader = reader(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +000078
79 def __iter__(self):
80 return self
81
82 def next(self):
83 row = self.reader.next()
Skip Montanarodffeed32003-10-03 14:03:01 +000084 if self.fieldnames is None:
85 self.fieldnames = row
86 row = self.reader.next()
87
Skip Montanaro04ae7052003-04-24 20:21:31 +000088 # unlike the basic reader, we prefer not to return blanks,
89 # because we will typically wind up with a dict full of None
90 # values
91 while row == []:
92 row = self.reader.next()
93 d = dict(zip(self.fieldnames, row))
94 lf = len(self.fieldnames)
95 lr = len(row)
96 if lf < lr:
97 d[self.restkey] = row[lf:]
98 elif lf > lr:
99 for key in self.fieldnames[lr:]:
100 d[key] = self.restval
101 return d
102
103
104class DictWriter:
105 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000106 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000107 self.fieldnames = fieldnames # list of keys for the dict
108 self.restval = restval # for writing short dicts
109 if extrasaction.lower() not in ("raise", "ignore"):
110 raise ValueError, \
111 ("extrasaction (%s) must be 'raise' or 'ignore'" %
112 extrasaction)
113 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000114 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000115
116 def _dict_to_list(self, rowdict):
117 if self.extrasaction == "raise":
118 for k in rowdict.keys():
119 if k not in self.fieldnames:
120 raise ValueError, "dict contains fields not in fieldnames"
121 return [rowdict.get(key, self.restval) for key in self.fieldnames]
122
123 def writerow(self, rowdict):
124 return self.writer.writerow(self._dict_to_list(rowdict))
125
126 def writerows(self, rowdicts):
127 rows = []
128 for rowdict in rowdicts:
129 rows.append(self._dict_to_list(rowdict))
130 return self.writer.writerows(rows)
131
Raymond Hettinger39a55922003-06-12 03:01:55 +0000132# Guard Sniffer's type checking against builds that exclude complex()
133try:
134 complex
135except NameError:
136 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000137
138class Sniffer:
139 '''
140 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000141 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000142 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000143 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000144 # in case there is more than one possible delimiter
145 self.preferred = [',', '\t', ';', ' ', ':']
146
Skip Montanaro04ae7052003-04-24 20:21:31 +0000147
Skip Montanaro77892372003-05-19 15:33:36 +0000148 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000149 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000150 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000151 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000152
153 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000154 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000155 if delimiter is None:
Skip Montanaro77892372003-05-19 15:33:36 +0000156 delimiter, skipinitialspace = self._guess_delimiter(sample,
157 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000158
Skip Montanaro1448d472003-04-25 14:47:16 +0000159 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000160 _name = "sniffed"
161 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000162 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 # escapechar = ''
164 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000165
Skip Montanaro1448d472003-04-25 14:47:16 +0000166 dialect.delimiter = delimiter
167 # _csv.reader won't accept a quotechar of ''
168 dialect.quotechar = quotechar or '"'
169 dialect.skipinitialspace = skipinitialspace
170
171 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000172
173
Skip Montanaro77892372003-05-19 15:33:36 +0000174 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000175 """
176 Looks for text enclosed between two identical quotes
177 (the probable quotechar) which are preceded and followed
178 by the same character (the probable delimiter).
179 For example:
180 ,'some text',
181 The quote with the most wins, same with the delimiter.
182 If there is no quotechar the delimiter can't be determined
183 this way.
184 """
185
186 matches = []
187 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
188 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
189 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
190 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000191 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000192 matches = regexp.findall(data)
193 if matches:
194 break
195
196 if not matches:
197 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
198
199 quotes = {}
200 delims = {}
201 spaces = 0
202 for m in matches:
203 n = regexp.groupindex['quote'] - 1
204 key = m[n]
205 if key:
206 quotes[key] = quotes.get(key, 0) + 1
207 try:
208 n = regexp.groupindex['delim'] - 1
209 key = m[n]
210 except KeyError:
211 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000212 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000213 delims[key] = delims.get(key, 0) + 1
214 try:
215 n = regexp.groupindex['space'] - 1
216 except KeyError:
217 continue
218 if m[n]:
219 spaces += 1
220
221 quotechar = reduce(lambda a, b, quotes = quotes:
222 (quotes[a] > quotes[b]) and a or b, quotes.keys())
223
224 if delims:
225 delim = reduce(lambda a, b, delims = delims:
226 (delims[a] > delims[b]) and a or b, delims.keys())
227 skipinitialspace = delims[delim] == spaces
228 if delim == '\n': # most likely a file with a single column
229 delim = ''
230 else:
231 # there is *no* delimiter, it's a single column of quoted data
232 delim = ''
233 skipinitialspace = 0
234
235 return (quotechar, delim, skipinitialspace)
236
237
Skip Montanaro77892372003-05-19 15:33:36 +0000238 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000239 """
240 The delimiter /should/ occur the same number of times on
241 each row. However, due to malformed data, it may not. We don't want
242 an all or nothing approach, so we allow for small variations in this
243 number.
244 1) build a table of the frequency of each character on every line.
245 2) build a table of freqencies of this frequency (meta-frequency?),
246 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
247 7 times in 2 rows'
248 3) use the mode of the meta-frequency to determine the /expected/
249 frequency for that character
250 4) find out how often the character actually meets that goal
251 5) the character that best meets its goal is the delimiter
252 For performance reasons, the data is evaluated in chunks, so it can
253 try and evaluate the smallest portion of the data possible, evaluating
254 additional chunks as necessary.
255 """
256
257 data = filter(None, data.split('\n'))
258
259 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
260
261 # build frequency tables
262 chunkLength = min(10, len(data))
263 iteration = 0
264 charFrequency = {}
265 modes = {}
266 delims = {}
267 start, end = 0, min(chunkLength, len(data))
268 while start < len(data):
269 iteration += 1
270 for line in data[start:end]:
271 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000272 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000273 # must count even if frequency is 0
274 freq = line.strip().count(char)
275 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000276 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
277 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000278
279 for char in charFrequency.keys():
280 items = charFrequency[char].items()
281 if len(items) == 1 and items[0][0] == 0:
282 continue
283 # get the mode of the frequencies
284 if len(items) > 1:
285 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
286 items)
287 # adjust the mode - subtract the sum of all
288 # other frequencies
289 items.remove(modes[char])
290 modes[char] = (modes[char][0], modes[char][1]
291 - reduce(lambda a, b: (0, a[1] + b[1]),
292 items)[1])
293 else:
294 modes[char] = items[0]
295
296 # build a list of possible delimiters
297 modeList = modes.items()
298 total = float(chunkLength * iteration)
299 # (rows of consistent data) / (number of rows) = 100%
300 consistency = 1.0
301 # minimum consistency threshold
302 threshold = 0.9
303 while len(delims) == 0 and consistency >= threshold:
304 for k, v in modeList:
305 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000306 if ((v[1]/total) >= consistency and
307 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000308 delims[k] = v
309 consistency -= 0.01
310
311 if len(delims) == 1:
312 delim = delims.keys()[0]
313 skipinitialspace = (data[0].count(delim) ==
314 data[0].count("%c " % delim))
315 return (delim, skipinitialspace)
316
317 # analyze another chunkLength lines
318 start = end
319 end += chunkLength
320
321 if not delims:
322 return ('', 0)
323
324 # if there's more than one, fall back to a 'preferred' list
325 if len(delims) > 1:
326 for d in self.preferred:
327 if d in delims.keys():
328 skipinitialspace = (data[0].count(d) ==
329 data[0].count("%c " % d))
330 return (d, skipinitialspace)
331
332 # finally, just return the first damn character in the list
333 delim = delims.keys()[0]
334 skipinitialspace = (data[0].count(delim) ==
335 data[0].count("%c " % delim))
336 return (delim, skipinitialspace)
337
338
Skip Montanaro1448d472003-04-25 14:47:16 +0000339 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000340 # Creates a dictionary of types of data in each column. If any
341 # column is of a single type (say, integers), *except* for the first
342 # row, then the first row is presumed to be labels. If the type
343 # can't be determined, it is assumed to be a string in which case
344 # the length of the string is the determining factor: if all of the
345 # rows except for the first are the same length, it's a header.
346 # Finally, a 'vote' is taken at the end for each column, adding or
347 # subtracting from the likelihood of the first row being a header.
348
Skip Montanaro1448d472003-04-25 14:47:16 +0000349 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000350
Skip Montanaro1448d472003-04-25 14:47:16 +0000351 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000352
353 columns = len(header)
354 columnTypes = {}
355 for i in range(columns): columnTypes[i] = None
356
357 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000358 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000359 # arbitrary number of rows to check, to keep it sane
360 if checked > 20:
361 break
362 checked += 1
363
364 if len(row) != columns:
365 continue # skip rows that have irregular number of columns
366
367 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000368
369 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000370 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000371 thisType(row[col])
372 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000373 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000374 pass
375 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000376 # fallback to length of string
377 thisType = len(row[col])
378
Raymond Hettinger39a55922003-06-12 03:01:55 +0000379 # treat longs as ints
380 if thisType == long:
381 thisType = int
382
Skip Montanaro04ae7052003-04-24 20:21:31 +0000383 if thisType != columnTypes[col]:
384 if columnTypes[col] is None: # add new column type
385 columnTypes[col] = thisType
386 else:
387 # type is inconsistent, remove column from
388 # consideration
389 del columnTypes[col]
390
391 # finally, compare results against first row and "vote"
392 # on whether it's a header
393 hasHeader = 0
394 for col, colType in columnTypes.items():
395 if type(colType) == type(0): # it's a length
396 if len(header[col]) != colType:
397 hasHeader += 1
398 else:
399 hasHeader -= 1
400 else: # attempt typecast
401 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000402 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000403 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000404 hasHeader += 1
405 else:
406 hasHeader -= 1
407
408 return hasHeader > 0