blob: fcb8a8fedcea790cb4c1759d80142a691d8fedfc [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
10 __doc__
11
Skip Montanaro1448d472003-04-25 14:47:16 +000012try:
13 from cStringIO import StringIO
14except ImportError:
15 from StringIO import StringIO
16
Skip Montanaro04ae7052003-04-24 20:21:31 +000017__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
18 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
19 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
21
22class Dialect:
23 _name = ""
24 _valid = False
25 # placeholders
26 delimiter = None
27 quotechar = None
28 escapechar = None
29 doublequote = None
30 skipinitialspace = None
31 lineterminator = None
32 quoting = None
33
34 def __init__(self):
35 if self.__class__ != Dialect:
36 self._valid = True
37 errors = self._validate()
38 if errors != []:
39 raise Error, "Dialect did not validate: %s" % ", ".join(errors)
40
41 def _validate(self):
42 errors = []
43 if not self._valid:
44 errors.append("can't directly instantiate Dialect class")
45
46 if self.delimiter is None:
47 errors.append("delimiter character not set")
48 elif (not isinstance(self.delimiter, str) or
49 len(self.delimiter) > 1):
50 errors.append("delimiter must be one-character string")
51
52 if self.quotechar is None:
53 if self.quoting != QUOTE_NONE:
54 errors.append("quotechar not set")
55 elif (not isinstance(self.quotechar, str) or
56 len(self.quotechar) > 1):
57 errors.append("quotechar must be one-character string")
58
59 if self.lineterminator is None:
60 errors.append("lineterminator not set")
61 elif not isinstance(self.lineterminator, str):
62 errors.append("lineterminator must be a string")
63
64 if self.doublequote not in (True, False):
65 errors.append("doublequote parameter must be True or False")
66
67 if self.skipinitialspace not in (True, False):
68 errors.append("skipinitialspace parameter must be True or False")
69
70 if self.quoting is None:
71 errors.append("quoting parameter not set")
72
73 if self.quoting is QUOTE_NONE:
74 if (not isinstance(self.escapechar, (unicode, str)) or
75 len(self.escapechar) > 1):
76 errors.append("escapechar must be a one-character string or unicode object")
77
78 return errors
79
80class excel(Dialect):
81 delimiter = ','
82 quotechar = '"'
83 doublequote = True
84 skipinitialspace = False
85 lineterminator = '\r\n'
86 quoting = QUOTE_MINIMAL
87register_dialect("excel", excel)
88
89class excel_tab(excel):
90 delimiter = '\t'
91register_dialect("excel-tab", excel_tab)
92
93
94class DictReader:
95 def __init__(self, f, fieldnames, restkey=None, restval=None,
96 dialect="excel", *args):
97 self.fieldnames = fieldnames # list of keys for the dict
98 self.restkey = restkey # key to catch long rows
99 self.restval = restval # default value for short rows
100 self.reader = reader(f, dialect, *args)
101
102 def __iter__(self):
103 return self
104
105 def next(self):
106 row = self.reader.next()
107 # unlike the basic reader, we prefer not to return blanks,
108 # because we will typically wind up with a dict full of None
109 # values
110 while row == []:
111 row = self.reader.next()
112 d = dict(zip(self.fieldnames, row))
113 lf = len(self.fieldnames)
114 lr = len(row)
115 if lf < lr:
116 d[self.restkey] = row[lf:]
117 elif lf > lr:
118 for key in self.fieldnames[lr:]:
119 d[key] = self.restval
120 return d
121
122
123class DictWriter:
124 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
125 dialect="excel", *args):
126 self.fieldnames = fieldnames # list of keys for the dict
127 self.restval = restval # for writing short dicts
128 if extrasaction.lower() not in ("raise", "ignore"):
129 raise ValueError, \
130 ("extrasaction (%s) must be 'raise' or 'ignore'" %
131 extrasaction)
132 self.extrasaction = extrasaction
133 self.writer = writer(f, dialect, *args)
134
135 def _dict_to_list(self, rowdict):
136 if self.extrasaction == "raise":
137 for k in rowdict.keys():
138 if k not in self.fieldnames:
139 raise ValueError, "dict contains fields not in fieldnames"
140 return [rowdict.get(key, self.restval) for key in self.fieldnames]
141
142 def writerow(self, rowdict):
143 return self.writer.writerow(self._dict_to_list(rowdict))
144
145 def writerows(self, rowdicts):
146 rows = []
147 for rowdict in rowdicts:
148 rows.append(self._dict_to_list(rowdict))
149 return self.writer.writerows(rows)
150
Raymond Hettinger39a55922003-06-12 03:01:55 +0000151# Guard Sniffer's type checking against builds that exclude complex()
152try:
153 complex
154except NameError:
155 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000156
157class Sniffer:
158 '''
159 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000160 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000161 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000162 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 # in case there is more than one possible delimiter
164 self.preferred = [',', '\t', ';', ' ', ':']
165
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166
Skip Montanaro77892372003-05-19 15:33:36 +0000167 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000168 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000169 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000170 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000171
172 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000173 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000174 if delimiter is None:
Skip Montanaro77892372003-05-19 15:33:36 +0000175 delimiter, skipinitialspace = self._guess_delimiter(sample,
176 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000177
Skip Montanaro1448d472003-04-25 14:47:16 +0000178 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000179 _name = "sniffed"
180 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000181 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000182 # escapechar = ''
183 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000184
Skip Montanaro1448d472003-04-25 14:47:16 +0000185 dialect.delimiter = delimiter
186 # _csv.reader won't accept a quotechar of ''
187 dialect.quotechar = quotechar or '"'
188 dialect.skipinitialspace = skipinitialspace
189
190 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000191
192
Skip Montanaro77892372003-05-19 15:33:36 +0000193 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000194 """
195 Looks for text enclosed between two identical quotes
196 (the probable quotechar) which are preceded and followed
197 by the same character (the probable delimiter).
198 For example:
199 ,'some text',
200 The quote with the most wins, same with the delimiter.
201 If there is no quotechar the delimiter can't be determined
202 this way.
203 """
204
205 matches = []
206 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
207 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
208 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
209 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000210 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000211 matches = regexp.findall(data)
212 if matches:
213 break
214
215 if not matches:
216 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
217
218 quotes = {}
219 delims = {}
220 spaces = 0
221 for m in matches:
222 n = regexp.groupindex['quote'] - 1
223 key = m[n]
224 if key:
225 quotes[key] = quotes.get(key, 0) + 1
226 try:
227 n = regexp.groupindex['delim'] - 1
228 key = m[n]
229 except KeyError:
230 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000231 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000232 delims[key] = delims.get(key, 0) + 1
233 try:
234 n = regexp.groupindex['space'] - 1
235 except KeyError:
236 continue
237 if m[n]:
238 spaces += 1
239
240 quotechar = reduce(lambda a, b, quotes = quotes:
241 (quotes[a] > quotes[b]) and a or b, quotes.keys())
242
243 if delims:
244 delim = reduce(lambda a, b, delims = delims:
245 (delims[a] > delims[b]) and a or b, delims.keys())
246 skipinitialspace = delims[delim] == spaces
247 if delim == '\n': # most likely a file with a single column
248 delim = ''
249 else:
250 # there is *no* delimiter, it's a single column of quoted data
251 delim = ''
252 skipinitialspace = 0
253
254 return (quotechar, delim, skipinitialspace)
255
256
Skip Montanaro77892372003-05-19 15:33:36 +0000257 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000258 """
259 The delimiter /should/ occur the same number of times on
260 each row. However, due to malformed data, it may not. We don't want
261 an all or nothing approach, so we allow for small variations in this
262 number.
263 1) build a table of the frequency of each character on every line.
264 2) build a table of freqencies of this frequency (meta-frequency?),
265 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
266 7 times in 2 rows'
267 3) use the mode of the meta-frequency to determine the /expected/
268 frequency for that character
269 4) find out how often the character actually meets that goal
270 5) the character that best meets its goal is the delimiter
271 For performance reasons, the data is evaluated in chunks, so it can
272 try and evaluate the smallest portion of the data possible, evaluating
273 additional chunks as necessary.
274 """
275
276 data = filter(None, data.split('\n'))
277
278 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
279
280 # build frequency tables
281 chunkLength = min(10, len(data))
282 iteration = 0
283 charFrequency = {}
284 modes = {}
285 delims = {}
286 start, end = 0, min(chunkLength, len(data))
287 while start < len(data):
288 iteration += 1
289 for line in data[start:end]:
290 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000291 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000292 # must count even if frequency is 0
293 freq = line.strip().count(char)
294 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000295 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
296 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000297
298 for char in charFrequency.keys():
299 items = charFrequency[char].items()
300 if len(items) == 1 and items[0][0] == 0:
301 continue
302 # get the mode of the frequencies
303 if len(items) > 1:
304 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
305 items)
306 # adjust the mode - subtract the sum of all
307 # other frequencies
308 items.remove(modes[char])
309 modes[char] = (modes[char][0], modes[char][1]
310 - reduce(lambda a, b: (0, a[1] + b[1]),
311 items)[1])
312 else:
313 modes[char] = items[0]
314
315 # build a list of possible delimiters
316 modeList = modes.items()
317 total = float(chunkLength * iteration)
318 # (rows of consistent data) / (number of rows) = 100%
319 consistency = 1.0
320 # minimum consistency threshold
321 threshold = 0.9
322 while len(delims) == 0 and consistency >= threshold:
323 for k, v in modeList:
324 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000325 if ((v[1]/total) >= consistency and
326 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000327 delims[k] = v
328 consistency -= 0.01
329
330 if len(delims) == 1:
331 delim = delims.keys()[0]
332 skipinitialspace = (data[0].count(delim) ==
333 data[0].count("%c " % delim))
334 return (delim, skipinitialspace)
335
336 # analyze another chunkLength lines
337 start = end
338 end += chunkLength
339
340 if not delims:
341 return ('', 0)
342
343 # if there's more than one, fall back to a 'preferred' list
344 if len(delims) > 1:
345 for d in self.preferred:
346 if d in delims.keys():
347 skipinitialspace = (data[0].count(d) ==
348 data[0].count("%c " % d))
349 return (d, skipinitialspace)
350
351 # finally, just return the first damn character in the list
352 delim = delims.keys()[0]
353 skipinitialspace = (data[0].count(delim) ==
354 data[0].count("%c " % delim))
355 return (delim, skipinitialspace)
356
357
Skip Montanaro1448d472003-04-25 14:47:16 +0000358 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000359 # Creates a dictionary of types of data in each column. If any
360 # column is of a single type (say, integers), *except* for the first
361 # row, then the first row is presumed to be labels. If the type
362 # can't be determined, it is assumed to be a string in which case
363 # the length of the string is the determining factor: if all of the
364 # rows except for the first are the same length, it's a header.
365 # Finally, a 'vote' is taken at the end for each column, adding or
366 # subtracting from the likelihood of the first row being a header.
367
Skip Montanaro1448d472003-04-25 14:47:16 +0000368 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000369
Skip Montanaro1448d472003-04-25 14:47:16 +0000370 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000371
372 columns = len(header)
373 columnTypes = {}
374 for i in range(columns): columnTypes[i] = None
375
376 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000377 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000378 # arbitrary number of rows to check, to keep it sane
379 if checked > 20:
380 break
381 checked += 1
382
383 if len(row) != columns:
384 continue # skip rows that have irregular number of columns
385
386 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000387
388 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000389 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000390 thisType(row[col])
391 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000392 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000393 pass
394 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000395 # fallback to length of string
396 thisType = len(row[col])
397
Raymond Hettinger39a55922003-06-12 03:01:55 +0000398 # treat longs as ints
399 if thisType == long:
400 thisType = int
401
Skip Montanaro04ae7052003-04-24 20:21:31 +0000402 if thisType != columnTypes[col]:
403 if columnTypes[col] is None: # add new column type
404 columnTypes[col] = thisType
405 else:
406 # type is inconsistent, remove column from
407 # consideration
408 del columnTypes[col]
409
410 # finally, compare results against first row and "vote"
411 # on whether it's a header
412 hasHeader = 0
413 for col, colType in columnTypes.items():
414 if type(colType) == type(0): # it's a length
415 if len(header[col]) != colType:
416 hasHeader += 1
417 else:
418 hasHeader -= 1
419 else: # attempt typecast
420 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000421 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000422 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000423 hasHeader += 1
424 else:
425 hasHeader -= 1
426
427 return hasHeader > 0