blob: 6ee12c8b0fb0ec6749e4be3b198f318346908f4d [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Skip Montanaro1448d472003-04-25 14:47:16 +000014try:
15 from cStringIO import StringIO
16except ImportError:
17 from StringIO import StringIO
18
Skip Montanaro04ae7052003-04-24 20:21:31 +000019__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
23
24class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000025 """Describe an Excel dialect.
26
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
30
31 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000032 _name = ""
33 _valid = False
34 # placeholders
35 delimiter = None
36 quotechar = None
37 escapechar = None
38 doublequote = None
39 skipinitialspace = None
40 lineterminator = None
41 quoting = None
42
43 def __init__(self):
44 if self.__class__ != Dialect:
45 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000047
48 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 try:
50 _Dialect(self)
Guido van Rossumb940e112007-01-10 16:19:56 +000051 except TypeError as e:
Andrew McNamara7130ff52005-01-11 02:22:47 +000052 # We do this for compatibility with py2.3
53 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000054
55class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000056 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000057 delimiter = ','
58 quotechar = '"'
59 doublequote = True
60 skipinitialspace = False
61 lineterminator = '\r\n'
62 quoting = QUOTE_MINIMAL
63register_dialect("excel", excel)
64
65class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000066 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000067 delimiter = '\t'
68register_dialect("excel-tab", excel_tab)
69
70
71class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000072 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000073 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000074 self.fieldnames = fieldnames # list of keys for the dict
75 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000077 self.reader = reader(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +000078
79 def __iter__(self):
80 return self
81
Georg Brandla18af4e2007-04-21 15:47:16 +000082 def __next__(self):
83 row = next(self.reader)
Skip Montanarodffeed32003-10-03 14:03:01 +000084 if self.fieldnames is None:
85 self.fieldnames = row
Georg Brandla18af4e2007-04-21 15:47:16 +000086 row = next(self.reader)
Skip Montanarodffeed32003-10-03 14:03:01 +000087
Skip Montanaro04ae7052003-04-24 20:21:31 +000088 # unlike the basic reader, we prefer not to return blanks,
89 # because we will typically wind up with a dict full of None
90 # values
91 while row == []:
Georg Brandla18af4e2007-04-21 15:47:16 +000092 row = next(self.reader)
Skip Montanaro04ae7052003-04-24 20:21:31 +000093 d = dict(zip(self.fieldnames, row))
94 lf = len(self.fieldnames)
95 lr = len(row)
96 if lf < lr:
97 d[self.restkey] = row[lf:]
98 elif lf > lr:
99 for key in self.fieldnames[lr:]:
100 d[key] = self.restval
101 return d
102
103
104class DictWriter:
105 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000106 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000107 self.fieldnames = fieldnames # list of keys for the dict
108 self.restval = restval # for writing short dicts
109 if extrasaction.lower() not in ("raise", "ignore"):
110 raise ValueError, \
111 ("extrasaction (%s) must be 'raise' or 'ignore'" %
112 extrasaction)
113 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000114 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000115
116 def _dict_to_list(self, rowdict):
117 if self.extrasaction == "raise":
Guido van Rossumd8faa362007-04-27 19:54:29 +0000118 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
119 if wrong_fields:
120 raise ValueError("dict contains fields not in fieldnames: " +
121 ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000122 return [rowdict.get(key, self.restval) for key in self.fieldnames]
123
124 def writerow(self, rowdict):
125 return self.writer.writerow(self._dict_to_list(rowdict))
126
127 def writerows(self, rowdicts):
128 rows = []
129 for rowdict in rowdicts:
130 rows.append(self._dict_to_list(rowdict))
131 return self.writer.writerows(rows)
132
Raymond Hettinger39a55922003-06-12 03:01:55 +0000133# Guard Sniffer's type checking against builds that exclude complex()
134try:
135 complex
136except NameError:
137 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000138
139class Sniffer:
140 '''
141 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000142 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000143 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000144 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000145 # in case there is more than one possible delimiter
146 self.preferred = [',', '\t', ';', ' ', ':']
147
Skip Montanaro04ae7052003-04-24 20:21:31 +0000148
Skip Montanaro77892372003-05-19 15:33:36 +0000149 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000150 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000151 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000152 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000153
154 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000155 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000156 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000157 delimiter, skipinitialspace = self._guess_delimiter(sample,
158 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000159
Skip Montanaro39b29be2005-12-30 05:09:48 +0000160 if not delimiter:
161 raise Error, "Could not determine delimiter"
162
Skip Montanaro1448d472003-04-25 14:47:16 +0000163 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000164 _name = "sniffed"
165 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000166 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000167 # escapechar = ''
168 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000169
Skip Montanaro1448d472003-04-25 14:47:16 +0000170 dialect.delimiter = delimiter
171 # _csv.reader won't accept a quotechar of ''
172 dialect.quotechar = quotechar or '"'
173 dialect.skipinitialspace = skipinitialspace
174
175 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000176
177
Skip Montanaro77892372003-05-19 15:33:36 +0000178 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000179 """
180 Looks for text enclosed between two identical quotes
181 (the probable quotechar) which are preceded and followed
182 by the same character (the probable delimiter).
183 For example:
184 ,'some text',
185 The quote with the most wins, same with the delimiter.
186 If there is no quotechar the delimiter can't be determined
187 this way.
188 """
189
190 matches = []
191 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
192 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
193 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
194 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000195 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000196 matches = regexp.findall(data)
197 if matches:
198 break
199
200 if not matches:
201 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
202
203 quotes = {}
204 delims = {}
205 spaces = 0
206 for m in matches:
207 n = regexp.groupindex['quote'] - 1
208 key = m[n]
209 if key:
210 quotes[key] = quotes.get(key, 0) + 1
211 try:
212 n = regexp.groupindex['delim'] - 1
213 key = m[n]
214 except KeyError:
215 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000216 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000217 delims[key] = delims.get(key, 0) + 1
218 try:
219 n = regexp.groupindex['space'] - 1
220 except KeyError:
221 continue
222 if m[n]:
223 spaces += 1
224
Guido van Rossum89da5d72006-08-22 00:21:25 +0000225 quotechar = max(quotes, key=quotes.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000226
227 if delims:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000228 delim = max(delims, key=delims.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000229 skipinitialspace = delims[delim] == spaces
230 if delim == '\n': # most likely a file with a single column
231 delim = ''
232 else:
233 # there is *no* delimiter, it's a single column of quoted data
234 delim = ''
235 skipinitialspace = 0
236
237 return (quotechar, delim, skipinitialspace)
238
239
Skip Montanaro77892372003-05-19 15:33:36 +0000240 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000241 """
242 The delimiter /should/ occur the same number of times on
243 each row. However, due to malformed data, it may not. We don't want
244 an all or nothing approach, so we allow for small variations in this
245 number.
246 1) build a table of the frequency of each character on every line.
247 2) build a table of freqencies of this frequency (meta-frequency?),
248 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
249 7 times in 2 rows'
250 3) use the mode of the meta-frequency to determine the /expected/
251 frequency for that character
252 4) find out how often the character actually meets that goal
253 5) the character that best meets its goal is the delimiter
254 For performance reasons, the data is evaluated in chunks, so it can
255 try and evaluate the smallest portion of the data possible, evaluating
256 additional chunks as necessary.
257 """
258
259 data = filter(None, data.split('\n'))
260
261 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
262
263 # build frequency tables
264 chunkLength = min(10, len(data))
265 iteration = 0
266 charFrequency = {}
267 modes = {}
268 delims = {}
269 start, end = 0, min(chunkLength, len(data))
270 while start < len(data):
271 iteration += 1
272 for line in data[start:end]:
273 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000274 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000275 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000276 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000277 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000278 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
279 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000280
281 for char in charFrequency.keys():
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000282 items = list(charFrequency[char].items())
Skip Montanaro04ae7052003-04-24 20:21:31 +0000283 if len(items) == 1 and items[0][0] == 0:
284 continue
285 # get the mode of the frequencies
286 if len(items) > 1:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000287 modes[char] = max(items, key=lambda x: x[1])
Skip Montanaro04ae7052003-04-24 20:21:31 +0000288 # adjust the mode - subtract the sum of all
289 # other frequencies
290 items.remove(modes[char])
291 modes[char] = (modes[char][0], modes[char][1]
Guido van Rossum89da5d72006-08-22 00:21:25 +0000292 - sum(item[1] for item in items))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000293 else:
294 modes[char] = items[0]
295
296 # build a list of possible delimiters
297 modeList = modes.items()
298 total = float(chunkLength * iteration)
299 # (rows of consistent data) / (number of rows) = 100%
300 consistency = 1.0
301 # minimum consistency threshold
302 threshold = 0.9
303 while len(delims) == 0 and consistency >= threshold:
304 for k, v in modeList:
305 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000306 if ((v[1]/total) >= consistency and
307 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000308 delims[k] = v
309 consistency -= 0.01
310
311 if len(delims) == 1:
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000312 delim = list(delims.keys())[0]
Skip Montanaro04ae7052003-04-24 20:21:31 +0000313 skipinitialspace = (data[0].count(delim) ==
314 data[0].count("%c " % delim))
315 return (delim, skipinitialspace)
316
317 # analyze another chunkLength lines
318 start = end
319 end += chunkLength
320
321 if not delims:
322 return ('', 0)
323
324 # if there's more than one, fall back to a 'preferred' list
325 if len(delims) > 1:
326 for d in self.preferred:
327 if d in delims.keys():
328 skipinitialspace = (data[0].count(d) ==
329 data[0].count("%c " % d))
330 return (d, skipinitialspace)
331
Skip Montanaro39b29be2005-12-30 05:09:48 +0000332 # nothing else indicates a preference, pick the character that
333 # dominates(?)
334 items = [(v,k) for (k,v) in delims.items()]
335 items.sort()
336 delim = items[-1][1]
337
Skip Montanaro04ae7052003-04-24 20:21:31 +0000338 skipinitialspace = (data[0].count(delim) ==
339 data[0].count("%c " % delim))
340 return (delim, skipinitialspace)
341
342
Skip Montanaro1448d472003-04-25 14:47:16 +0000343 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000344 # Creates a dictionary of types of data in each column. If any
345 # column is of a single type (say, integers), *except* for the first
346 # row, then the first row is presumed to be labels. If the type
347 # can't be determined, it is assumed to be a string in which case
348 # the length of the string is the determining factor: if all of the
349 # rows except for the first are the same length, it's a header.
350 # Finally, a 'vote' is taken at the end for each column, adding or
351 # subtracting from the likelihood of the first row being a header.
352
Skip Montanaro1448d472003-04-25 14:47:16 +0000353 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000354
Georg Brandla18af4e2007-04-21 15:47:16 +0000355 header = next(rdr) # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000356
357 columns = len(header)
358 columnTypes = {}
359 for i in range(columns): columnTypes[i] = None
360
361 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000362 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000363 # arbitrary number of rows to check, to keep it sane
364 if checked > 20:
365 break
366 checked += 1
367
368 if len(row) != columns:
369 continue # skip rows that have irregular number of columns
370
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000371 for col in list(columnTypes.keys()):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000372
Guido van Rossume2a383d2007-01-15 16:59:06 +0000373 for thisType in [int, int, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000374 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000375 thisType(row[col])
376 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000377 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000378 pass
379 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000380 # fallback to length of string
381 thisType = len(row[col])
382
Raymond Hettinger39a55922003-06-12 03:01:55 +0000383 # treat longs as ints
Guido van Rossume2a383d2007-01-15 16:59:06 +0000384 if thisType == int:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000385 thisType = int
386
Skip Montanaro04ae7052003-04-24 20:21:31 +0000387 if thisType != columnTypes[col]:
388 if columnTypes[col] is None: # add new column type
389 columnTypes[col] = thisType
390 else:
391 # type is inconsistent, remove column from
392 # consideration
393 del columnTypes[col]
394
395 # finally, compare results against first row and "vote"
396 # on whether it's a header
397 hasHeader = 0
398 for col, colType in columnTypes.items():
399 if type(colType) == type(0): # it's a length
400 if len(header[col]) != colType:
401 hasHeader += 1
402 else:
403 hasHeader -= 1
404 else: # attempt typecast
405 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000406 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000407 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000408 hasHeader += 1
409 else:
410 hasHeader -= 1
411
412 return hasHeader > 0