blob: 45570f74c03d5362b3a90754ed6baa9cf2b538fb [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Skip Montanaro1448d472003-04-25 14:47:16 +000014try:
15 from cStringIO import StringIO
16except ImportError:
17 from StringIO import StringIO
18
Skip Montanaro04ae7052003-04-24 20:21:31 +000019__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
23
24class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000025 """Describe an Excel dialect.
26
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
30
31 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000032 _name = ""
33 _valid = False
34 # placeholders
35 delimiter = None
36 quotechar = None
37 escapechar = None
38 doublequote = None
39 skipinitialspace = None
40 lineterminator = None
41 quoting = None
42
43 def __init__(self):
44 if self.__class__ != Dialect:
45 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000047
48 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 try:
50 _Dialect(self)
Guido van Rossumb940e112007-01-10 16:19:56 +000051 except TypeError as e:
Andrew McNamara7130ff52005-01-11 02:22:47 +000052 # We do this for compatibility with py2.3
53 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000054
55class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000056 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000057 delimiter = ','
58 quotechar = '"'
59 doublequote = True
60 skipinitialspace = False
61 lineterminator = '\r\n'
62 quoting = QUOTE_MINIMAL
63register_dialect("excel", excel)
64
65class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000066 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000067 delimiter = '\t'
68register_dialect("excel-tab", excel_tab)
69
70
71class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000072 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000073 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000074 self.fieldnames = fieldnames # list of keys for the dict
75 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000077 self.reader = reader(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +000078
79 def __iter__(self):
80 return self
81
Georg Brandla18af4e2007-04-21 15:47:16 +000082 def __next__(self):
83 row = next(self.reader)
Skip Montanarodffeed32003-10-03 14:03:01 +000084 if self.fieldnames is None:
85 self.fieldnames = row
Georg Brandla18af4e2007-04-21 15:47:16 +000086 row = next(self.reader)
Skip Montanarodffeed32003-10-03 14:03:01 +000087
Skip Montanaro04ae7052003-04-24 20:21:31 +000088 # unlike the basic reader, we prefer not to return blanks,
89 # because we will typically wind up with a dict full of None
90 # values
91 while row == []:
Georg Brandla18af4e2007-04-21 15:47:16 +000092 row = next(self.reader)
Skip Montanaro04ae7052003-04-24 20:21:31 +000093 d = dict(zip(self.fieldnames, row))
94 lf = len(self.fieldnames)
95 lr = len(row)
96 if lf < lr:
97 d[self.restkey] = row[lf:]
98 elif lf > lr:
99 for key in self.fieldnames[lr:]:
100 d[key] = self.restval
101 return d
102
103
104class DictWriter:
105 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000106 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000107 self.fieldnames = fieldnames # list of keys for the dict
108 self.restval = restval # for writing short dicts
109 if extrasaction.lower() not in ("raise", "ignore"):
110 raise ValueError, \
111 ("extrasaction (%s) must be 'raise' or 'ignore'" %
112 extrasaction)
113 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000114 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000115
116 def _dict_to_list(self, rowdict):
117 if self.extrasaction == "raise":
118 for k in rowdict.keys():
119 if k not in self.fieldnames:
120 raise ValueError, "dict contains fields not in fieldnames"
121 return [rowdict.get(key, self.restval) for key in self.fieldnames]
122
123 def writerow(self, rowdict):
124 return self.writer.writerow(self._dict_to_list(rowdict))
125
126 def writerows(self, rowdicts):
127 rows = []
128 for rowdict in rowdicts:
129 rows.append(self._dict_to_list(rowdict))
130 return self.writer.writerows(rows)
131
Raymond Hettinger39a55922003-06-12 03:01:55 +0000132# Guard Sniffer's type checking against builds that exclude complex()
133try:
134 complex
135except NameError:
136 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000137
138class Sniffer:
139 '''
140 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000141 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000142 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000143 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000144 # in case there is more than one possible delimiter
145 self.preferred = [',', '\t', ';', ' ', ':']
146
Skip Montanaro04ae7052003-04-24 20:21:31 +0000147
Skip Montanaro77892372003-05-19 15:33:36 +0000148 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000149 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000150 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000151 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000152
153 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000154 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000155 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000156 delimiter, skipinitialspace = self._guess_delimiter(sample,
157 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000158
Skip Montanaro39b29be2005-12-30 05:09:48 +0000159 if not delimiter:
160 raise Error, "Could not determine delimiter"
161
Skip Montanaro1448d472003-04-25 14:47:16 +0000162 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 _name = "sniffed"
164 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000165 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166 # escapechar = ''
167 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000168
Skip Montanaro1448d472003-04-25 14:47:16 +0000169 dialect.delimiter = delimiter
170 # _csv.reader won't accept a quotechar of ''
171 dialect.quotechar = quotechar or '"'
172 dialect.skipinitialspace = skipinitialspace
173
174 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000175
176
Skip Montanaro77892372003-05-19 15:33:36 +0000177 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000178 """
179 Looks for text enclosed between two identical quotes
180 (the probable quotechar) which are preceded and followed
181 by the same character (the probable delimiter).
182 For example:
183 ,'some text',
184 The quote with the most wins, same with the delimiter.
185 If there is no quotechar the delimiter can't be determined
186 this way.
187 """
188
189 matches = []
190 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
191 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
192 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
193 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000194 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000195 matches = regexp.findall(data)
196 if matches:
197 break
198
199 if not matches:
200 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
201
202 quotes = {}
203 delims = {}
204 spaces = 0
205 for m in matches:
206 n = regexp.groupindex['quote'] - 1
207 key = m[n]
208 if key:
209 quotes[key] = quotes.get(key, 0) + 1
210 try:
211 n = regexp.groupindex['delim'] - 1
212 key = m[n]
213 except KeyError:
214 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000215 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000216 delims[key] = delims.get(key, 0) + 1
217 try:
218 n = regexp.groupindex['space'] - 1
219 except KeyError:
220 continue
221 if m[n]:
222 spaces += 1
223
Guido van Rossum89da5d72006-08-22 00:21:25 +0000224 quotechar = max(quotes, key=quotes.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000225
226 if delims:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000227 delim = max(delims, key=delims.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000228 skipinitialspace = delims[delim] == spaces
229 if delim == '\n': # most likely a file with a single column
230 delim = ''
231 else:
232 # there is *no* delimiter, it's a single column of quoted data
233 delim = ''
234 skipinitialspace = 0
235
236 return (quotechar, delim, skipinitialspace)
237
238
Skip Montanaro77892372003-05-19 15:33:36 +0000239 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000240 """
241 The delimiter /should/ occur the same number of times on
242 each row. However, due to malformed data, it may not. We don't want
243 an all or nothing approach, so we allow for small variations in this
244 number.
245 1) build a table of the frequency of each character on every line.
246 2) build a table of freqencies of this frequency (meta-frequency?),
247 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
248 7 times in 2 rows'
249 3) use the mode of the meta-frequency to determine the /expected/
250 frequency for that character
251 4) find out how often the character actually meets that goal
252 5) the character that best meets its goal is the delimiter
253 For performance reasons, the data is evaluated in chunks, so it can
254 try and evaluate the smallest portion of the data possible, evaluating
255 additional chunks as necessary.
256 """
257
258 data = filter(None, data.split('\n'))
259
260 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
261
262 # build frequency tables
263 chunkLength = min(10, len(data))
264 iteration = 0
265 charFrequency = {}
266 modes = {}
267 delims = {}
268 start, end = 0, min(chunkLength, len(data))
269 while start < len(data):
270 iteration += 1
271 for line in data[start:end]:
272 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000273 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000274 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000275 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000276 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000277 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
278 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000279
280 for char in charFrequency.keys():
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000281 items = list(charFrequency[char].items())
Skip Montanaro04ae7052003-04-24 20:21:31 +0000282 if len(items) == 1 and items[0][0] == 0:
283 continue
284 # get the mode of the frequencies
285 if len(items) > 1:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000286 modes[char] = max(items, key=lambda x: x[1])
Skip Montanaro04ae7052003-04-24 20:21:31 +0000287 # adjust the mode - subtract the sum of all
288 # other frequencies
289 items.remove(modes[char])
290 modes[char] = (modes[char][0], modes[char][1]
Guido van Rossum89da5d72006-08-22 00:21:25 +0000291 - sum(item[1] for item in items))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000292 else:
293 modes[char] = items[0]
294
295 # build a list of possible delimiters
296 modeList = modes.items()
297 total = float(chunkLength * iteration)
298 # (rows of consistent data) / (number of rows) = 100%
299 consistency = 1.0
300 # minimum consistency threshold
301 threshold = 0.9
302 while len(delims) == 0 and consistency >= threshold:
303 for k, v in modeList:
304 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000305 if ((v[1]/total) >= consistency and
306 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000307 delims[k] = v
308 consistency -= 0.01
309
310 if len(delims) == 1:
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000311 delim = list(delims.keys())[0]
Skip Montanaro04ae7052003-04-24 20:21:31 +0000312 skipinitialspace = (data[0].count(delim) ==
313 data[0].count("%c " % delim))
314 return (delim, skipinitialspace)
315
316 # analyze another chunkLength lines
317 start = end
318 end += chunkLength
319
320 if not delims:
321 return ('', 0)
322
323 # if there's more than one, fall back to a 'preferred' list
324 if len(delims) > 1:
325 for d in self.preferred:
326 if d in delims.keys():
327 skipinitialspace = (data[0].count(d) ==
328 data[0].count("%c " % d))
329 return (d, skipinitialspace)
330
Skip Montanaro39b29be2005-12-30 05:09:48 +0000331 # nothing else indicates a preference, pick the character that
332 # dominates(?)
333 items = [(v,k) for (k,v) in delims.items()]
334 items.sort()
335 delim = items[-1][1]
336
Skip Montanaro04ae7052003-04-24 20:21:31 +0000337 skipinitialspace = (data[0].count(delim) ==
338 data[0].count("%c " % delim))
339 return (delim, skipinitialspace)
340
341
Skip Montanaro1448d472003-04-25 14:47:16 +0000342 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000343 # Creates a dictionary of types of data in each column. If any
344 # column is of a single type (say, integers), *except* for the first
345 # row, then the first row is presumed to be labels. If the type
346 # can't be determined, it is assumed to be a string in which case
347 # the length of the string is the determining factor: if all of the
348 # rows except for the first are the same length, it's a header.
349 # Finally, a 'vote' is taken at the end for each column, adding or
350 # subtracting from the likelihood of the first row being a header.
351
Skip Montanaro1448d472003-04-25 14:47:16 +0000352 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000353
Georg Brandla18af4e2007-04-21 15:47:16 +0000354 header = next(rdr) # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000355
356 columns = len(header)
357 columnTypes = {}
358 for i in range(columns): columnTypes[i] = None
359
360 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000361 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000362 # arbitrary number of rows to check, to keep it sane
363 if checked > 20:
364 break
365 checked += 1
366
367 if len(row) != columns:
368 continue # skip rows that have irregular number of columns
369
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000370 for col in list(columnTypes.keys()):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000371
Guido van Rossume2a383d2007-01-15 16:59:06 +0000372 for thisType in [int, int, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000373 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000374 thisType(row[col])
375 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000376 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000377 pass
378 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000379 # fallback to length of string
380 thisType = len(row[col])
381
Raymond Hettinger39a55922003-06-12 03:01:55 +0000382 # treat longs as ints
Guido van Rossume2a383d2007-01-15 16:59:06 +0000383 if thisType == int:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000384 thisType = int
385
Skip Montanaro04ae7052003-04-24 20:21:31 +0000386 if thisType != columnTypes[col]:
387 if columnTypes[col] is None: # add new column type
388 columnTypes[col] = thisType
389 else:
390 # type is inconsistent, remove column from
391 # consideration
392 del columnTypes[col]
393
394 # finally, compare results against first row and "vote"
395 # on whether it's a header
396 hasHeader = 0
397 for col, colType in columnTypes.items():
398 if type(colType) == type(0): # it's a length
399 if len(header[col]) != colType:
400 hasHeader += 1
401 else:
402 hasHeader -= 1
403 else: # attempt typecast
404 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000405 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000406 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000407 hasHeader += 1
408 else:
409 hasHeader -= 1
410
411 return hasHeader > 0