blob: 37bdba524adacba5943940f08ed61b78d1579427 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
10 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000011from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000012
Skip Montanaro1448d472003-04-25 14:47:16 +000013try:
14 from cStringIO import StringIO
15except ImportError:
16 from StringIO import StringIO
17
Skip Montanaro04ae7052003-04-24 20:21:31 +000018__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
19 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
20 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
21 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
22
23class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000024 """Describe an Excel dialect.
25
26 This must be subclassed (see csv.excel). Valid attributes are:
27 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
28 lineterminator, quoting.
29
30 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000031 _name = ""
32 _valid = False
33 # placeholders
34 delimiter = None
35 quotechar = None
36 escapechar = None
37 doublequote = None
38 skipinitialspace = None
39 lineterminator = None
40 quoting = None
41
42 def __init__(self):
43 if self.__class__ != Dialect:
44 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000045 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000046
47 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000048 try:
49 _Dialect(self)
50 except TypeError, e:
51 # We do this for compatibility with py2.3
52 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000053
54class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000055 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000056 delimiter = ','
57 quotechar = '"'
58 doublequote = True
59 skipinitialspace = False
60 lineterminator = '\r\n'
61 quoting = QUOTE_MINIMAL
62register_dialect("excel", excel)
63
64class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000065 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000066 delimiter = '\t'
67register_dialect("excel-tab", excel_tab)
68
69
70class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000071 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000072 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000073 self.fieldnames = fieldnames # list of keys for the dict
74 self.restkey = restkey # key to catch long rows
75 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000076 self.reader = reader(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +000077
78 def __iter__(self):
79 return self
80
81 def next(self):
82 row = self.reader.next()
Skip Montanarodffeed32003-10-03 14:03:01 +000083 if self.fieldnames is None:
84 self.fieldnames = row
85 row = self.reader.next()
86
Skip Montanaro04ae7052003-04-24 20:21:31 +000087 # unlike the basic reader, we prefer not to return blanks,
88 # because we will typically wind up with a dict full of None
89 # values
90 while row == []:
91 row = self.reader.next()
92 d = dict(zip(self.fieldnames, row))
93 lf = len(self.fieldnames)
94 lr = len(row)
95 if lf < lr:
96 d[self.restkey] = row[lf:]
97 elif lf > lr:
98 for key in self.fieldnames[lr:]:
99 d[key] = self.restval
100 return d
101
102
103class DictWriter:
104 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000105 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000106 self.fieldnames = fieldnames # list of keys for the dict
107 self.restval = restval # for writing short dicts
108 if extrasaction.lower() not in ("raise", "ignore"):
109 raise ValueError, \
110 ("extrasaction (%s) must be 'raise' or 'ignore'" %
111 extrasaction)
112 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000113 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000114
115 def _dict_to_list(self, rowdict):
116 if self.extrasaction == "raise":
117 for k in rowdict.keys():
118 if k not in self.fieldnames:
119 raise ValueError, "dict contains fields not in fieldnames"
120 return [rowdict.get(key, self.restval) for key in self.fieldnames]
121
122 def writerow(self, rowdict):
123 return self.writer.writerow(self._dict_to_list(rowdict))
124
125 def writerows(self, rowdicts):
126 rows = []
127 for rowdict in rowdicts:
128 rows.append(self._dict_to_list(rowdict))
129 return self.writer.writerows(rows)
130
Raymond Hettinger39a55922003-06-12 03:01:55 +0000131# Guard Sniffer's type checking against builds that exclude complex()
132try:
133 complex
134except NameError:
135 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000136
137class Sniffer:
138 '''
139 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000140 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000141 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000142 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000143 # in case there is more than one possible delimiter
144 self.preferred = [',', '\t', ';', ' ', ':']
145
Skip Montanaro04ae7052003-04-24 20:21:31 +0000146
Skip Montanaro77892372003-05-19 15:33:36 +0000147 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000148 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000149 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000150 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000151
152 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000153 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000154 if delimiter is None:
Skip Montanaro77892372003-05-19 15:33:36 +0000155 delimiter, skipinitialspace = self._guess_delimiter(sample,
156 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000157
Skip Montanaro1448d472003-04-25 14:47:16 +0000158 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000159 _name = "sniffed"
160 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000161 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000162 # escapechar = ''
163 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000164
Skip Montanaro1448d472003-04-25 14:47:16 +0000165 dialect.delimiter = delimiter
166 # _csv.reader won't accept a quotechar of ''
167 dialect.quotechar = quotechar or '"'
168 dialect.skipinitialspace = skipinitialspace
169
170 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000171
172
Skip Montanaro77892372003-05-19 15:33:36 +0000173 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000174 """
175 Looks for text enclosed between two identical quotes
176 (the probable quotechar) which are preceded and followed
177 by the same character (the probable delimiter).
178 For example:
179 ,'some text',
180 The quote with the most wins, same with the delimiter.
181 If there is no quotechar the delimiter can't be determined
182 this way.
183 """
184
185 matches = []
186 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
187 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
188 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
189 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000190 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000191 matches = regexp.findall(data)
192 if matches:
193 break
194
195 if not matches:
196 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
197
198 quotes = {}
199 delims = {}
200 spaces = 0
201 for m in matches:
202 n = regexp.groupindex['quote'] - 1
203 key = m[n]
204 if key:
205 quotes[key] = quotes.get(key, 0) + 1
206 try:
207 n = regexp.groupindex['delim'] - 1
208 key = m[n]
209 except KeyError:
210 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000211 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000212 delims[key] = delims.get(key, 0) + 1
213 try:
214 n = regexp.groupindex['space'] - 1
215 except KeyError:
216 continue
217 if m[n]:
218 spaces += 1
219
220 quotechar = reduce(lambda a, b, quotes = quotes:
221 (quotes[a] > quotes[b]) and a or b, quotes.keys())
222
223 if delims:
224 delim = reduce(lambda a, b, delims = delims:
225 (delims[a] > delims[b]) and a or b, delims.keys())
226 skipinitialspace = delims[delim] == spaces
227 if delim == '\n': # most likely a file with a single column
228 delim = ''
229 else:
230 # there is *no* delimiter, it's a single column of quoted data
231 delim = ''
232 skipinitialspace = 0
233
234 return (quotechar, delim, skipinitialspace)
235
236
Skip Montanaro77892372003-05-19 15:33:36 +0000237 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000238 """
239 The delimiter /should/ occur the same number of times on
240 each row. However, due to malformed data, it may not. We don't want
241 an all or nothing approach, so we allow for small variations in this
242 number.
243 1) build a table of the frequency of each character on every line.
244 2) build a table of freqencies of this frequency (meta-frequency?),
245 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
246 7 times in 2 rows'
247 3) use the mode of the meta-frequency to determine the /expected/
248 frequency for that character
249 4) find out how often the character actually meets that goal
250 5) the character that best meets its goal is the delimiter
251 For performance reasons, the data is evaluated in chunks, so it can
252 try and evaluate the smallest portion of the data possible, evaluating
253 additional chunks as necessary.
254 """
255
256 data = filter(None, data.split('\n'))
257
258 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
259
260 # build frequency tables
261 chunkLength = min(10, len(data))
262 iteration = 0
263 charFrequency = {}
264 modes = {}
265 delims = {}
266 start, end = 0, min(chunkLength, len(data))
267 while start < len(data):
268 iteration += 1
269 for line in data[start:end]:
270 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000271 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000272 # must count even if frequency is 0
273 freq = line.strip().count(char)
274 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000275 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
276 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000277
278 for char in charFrequency.keys():
279 items = charFrequency[char].items()
280 if len(items) == 1 and items[0][0] == 0:
281 continue
282 # get the mode of the frequencies
283 if len(items) > 1:
284 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
285 items)
286 # adjust the mode - subtract the sum of all
287 # other frequencies
288 items.remove(modes[char])
289 modes[char] = (modes[char][0], modes[char][1]
290 - reduce(lambda a, b: (0, a[1] + b[1]),
291 items)[1])
292 else:
293 modes[char] = items[0]
294
295 # build a list of possible delimiters
296 modeList = modes.items()
297 total = float(chunkLength * iteration)
298 # (rows of consistent data) / (number of rows) = 100%
299 consistency = 1.0
300 # minimum consistency threshold
301 threshold = 0.9
302 while len(delims) == 0 and consistency >= threshold:
303 for k, v in modeList:
304 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000305 if ((v[1]/total) >= consistency and
306 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000307 delims[k] = v
308 consistency -= 0.01
309
310 if len(delims) == 1:
311 delim = delims.keys()[0]
312 skipinitialspace = (data[0].count(delim) ==
313 data[0].count("%c " % delim))
314 return (delim, skipinitialspace)
315
316 # analyze another chunkLength lines
317 start = end
318 end += chunkLength
319
320 if not delims:
321 return ('', 0)
322
323 # if there's more than one, fall back to a 'preferred' list
324 if len(delims) > 1:
325 for d in self.preferred:
326 if d in delims.keys():
327 skipinitialspace = (data[0].count(d) ==
328 data[0].count("%c " % d))
329 return (d, skipinitialspace)
330
331 # finally, just return the first damn character in the list
332 delim = delims.keys()[0]
333 skipinitialspace = (data[0].count(delim) ==
334 data[0].count("%c " % delim))
335 return (delim, skipinitialspace)
336
337
Skip Montanaro1448d472003-04-25 14:47:16 +0000338 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000339 # Creates a dictionary of types of data in each column. If any
340 # column is of a single type (say, integers), *except* for the first
341 # row, then the first row is presumed to be labels. If the type
342 # can't be determined, it is assumed to be a string in which case
343 # the length of the string is the determining factor: if all of the
344 # rows except for the first are the same length, it's a header.
345 # Finally, a 'vote' is taken at the end for each column, adding or
346 # subtracting from the likelihood of the first row being a header.
347
Skip Montanaro1448d472003-04-25 14:47:16 +0000348 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000349
Skip Montanaro1448d472003-04-25 14:47:16 +0000350 header = rdr.next() # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000351
352 columns = len(header)
353 columnTypes = {}
354 for i in range(columns): columnTypes[i] = None
355
356 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000357 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000358 # arbitrary number of rows to check, to keep it sane
359 if checked > 20:
360 break
361 checked += 1
362
363 if len(row) != columns:
364 continue # skip rows that have irregular number of columns
365
366 for col in columnTypes.keys():
Raymond Hettinger39a55922003-06-12 03:01:55 +0000367
368 for thisType in [int, long, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000369 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000370 thisType(row[col])
371 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000372 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000373 pass
374 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000375 # fallback to length of string
376 thisType = len(row[col])
377
Raymond Hettinger39a55922003-06-12 03:01:55 +0000378 # treat longs as ints
379 if thisType == long:
380 thisType = int
381
Skip Montanaro04ae7052003-04-24 20:21:31 +0000382 if thisType != columnTypes[col]:
383 if columnTypes[col] is None: # add new column type
384 columnTypes[col] = thisType
385 else:
386 # type is inconsistent, remove column from
387 # consideration
388 del columnTypes[col]
389
390 # finally, compare results against first row and "vote"
391 # on whether it's a header
392 hasHeader = 0
393 for col, colType in columnTypes.items():
394 if type(colType) == type(0): # it's a length
395 if len(header[col]) != colType:
396 hasHeader += 1
397 else:
398 hasHeader -= 1
399 else: # attempt typecast
400 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000401 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000402 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000403 hasHeader += 1
404 else:
405 hasHeader -= 1
406
407 return hasHeader > 0