blob: 5ae5a730468ff6bc8a4bf4047973c56277f1900c [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Guido van Rossum68937b42007-05-18 00:51:22 +000014from io import StringIO
Skip Montanaro1448d472003-04-25 14:47:16 +000015
Skip Montanaro04ae7052003-04-24 20:21:31 +000016__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
Brett Cannone8d0bf92008-08-18 00:51:19 +000017 "Error", "Dialect", "__doc__", "excel", "excel_tab",
18 "field_size_limit", "reader", "writer",
Skip Montanaro04ae7052003-04-24 20:21:31 +000019 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
21
22class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000023 """Describe an Excel dialect.
24
25 This must be subclassed (see csv.excel). Valid attributes are:
26 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
27 lineterminator, quoting.
28
29 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000030 _name = ""
31 _valid = False
32 # placeholders
33 delimiter = None
34 quotechar = None
35 escapechar = None
36 doublequote = None
37 skipinitialspace = None
38 lineterminator = None
39 quoting = None
40
41 def __init__(self):
42 if self.__class__ != Dialect:
43 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000044 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000045
46 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000047 try:
48 _Dialect(self)
Guido van Rossumb940e112007-01-10 16:19:56 +000049 except TypeError as e:
Andrew McNamara7130ff52005-01-11 02:22:47 +000050 # We do this for compatibility with py2.3
51 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000052
53class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000054 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000055 delimiter = ','
56 quotechar = '"'
57 doublequote = True
58 skipinitialspace = False
59 lineterminator = '\r\n'
60 quoting = QUOTE_MINIMAL
61register_dialect("excel", excel)
62
63class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000064 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000065 delimiter = '\t'
66register_dialect("excel-tab", excel_tab)
67
68
69class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000070 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000071 dialect="excel", *args, **kwds):
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000072 self._fieldnames = fieldnames # list of keys for the dict
Skip Montanaro04ae7052003-04-24 20:21:31 +000073 self.restkey = restkey # key to catch long rows
74 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000075 self.reader = reader(f, dialect, *args, **kwds)
Christian Heimes4fbc72b2008-03-22 00:47:35 +000076 self.dialect = dialect
77 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000078
79 def __iter__(self):
80 return self
81
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000082 @property
83 def fieldnames(self):
84 if self._fieldnames is None:
85 try:
86 self._fieldnames = next(self.reader)
87 except StopIteration:
88 pass
89 self.line_num = self.reader.line_num
90 return self._fieldnames
91
92 @fieldnames.setter
93 def fieldnames(self, value):
94 self._fieldnames = value
95
Georg Brandla18af4e2007-04-21 15:47:16 +000096 def __next__(self):
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000097 if self.line_num == 0:
98 # Used only for its side effect.
99 self.fieldnames
Georg Brandla18af4e2007-04-21 15:47:16 +0000100 row = next(self.reader)
Christian Heimes4fbc72b2008-03-22 00:47:35 +0000101 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +0000102
Skip Montanaro04ae7052003-04-24 20:21:31 +0000103 # unlike the basic reader, we prefer not to return blanks,
104 # because we will typically wind up with a dict full of None
105 # values
106 while row == []:
Georg Brandla18af4e2007-04-21 15:47:16 +0000107 row = next(self.reader)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000108 d = dict(zip(self.fieldnames, row))
109 lf = len(self.fieldnames)
110 lr = len(row)
111 if lf < lr:
112 d[self.restkey] = row[lf:]
113 elif lf > lr:
114 for key in self.fieldnames[lr:]:
115 d[key] = self.restval
116 return d
117
118
119class DictWriter:
120 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000121 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000122 self.fieldnames = fieldnames # list of keys for the dict
123 self.restval = restval # for writing short dicts
124 if extrasaction.lower() not in ("raise", "ignore"):
Collin Winterce36ad82007-08-30 01:19:48 +0000125 raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
126 % extrasaction)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000127 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000128 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000129
R. David Murraybe0698b2010-02-23 22:57:58 +0000130 def writeheader(self):
131 header = dict(zip(self.fieldnames, self.fieldnames))
132 self.writerow(header)
133
Skip Montanaro04ae7052003-04-24 20:21:31 +0000134 def _dict_to_list(self, rowdict):
135 if self.extrasaction == "raise":
Guido van Rossumd8faa362007-04-27 19:54:29 +0000136 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
137 if wrong_fields:
Collin Winterce36ad82007-08-30 01:19:48 +0000138 raise ValueError("dict contains fields not in fieldnames: "
139 + ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000140 return [rowdict.get(key, self.restval) for key in self.fieldnames]
141
142 def writerow(self, rowdict):
143 return self.writer.writerow(self._dict_to_list(rowdict))
144
145 def writerows(self, rowdicts):
146 rows = []
147 for rowdict in rowdicts:
148 rows.append(self._dict_to_list(rowdict))
149 return self.writer.writerows(rows)
150
Raymond Hettinger39a55922003-06-12 03:01:55 +0000151# Guard Sniffer's type checking against builds that exclude complex()
152try:
153 complex
154except NameError:
155 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000156
157class Sniffer:
158 '''
159 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000160 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000161 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000162 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 # in case there is more than one possible delimiter
164 self.preferred = [',', '\t', ';', ' ', ':']
165
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166
Skip Montanaro77892372003-05-19 15:33:36 +0000167 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000168 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000169 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000170 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000171
Benjamin Peterson4ac9ce42009-10-04 14:49:41 +0000172 quotechar, doublequote, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000173 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000174 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000175 delimiter, skipinitialspace = self._guess_delimiter(sample,
176 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000177
Skip Montanaro39b29be2005-12-30 05:09:48 +0000178 if not delimiter:
Collin Winterce36ad82007-08-30 01:19:48 +0000179 raise Error("Could not determine delimiter")
Skip Montanaro39b29be2005-12-30 05:09:48 +0000180
Skip Montanaro1448d472003-04-25 14:47:16 +0000181 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000182 _name = "sniffed"
183 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000184 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000185 # escapechar = ''
Skip Montanaro04ae7052003-04-24 20:21:31 +0000186
Benjamin Peterson4ac9ce42009-10-04 14:49:41 +0000187 dialect.doublequote = doublequote
Skip Montanaro1448d472003-04-25 14:47:16 +0000188 dialect.delimiter = delimiter
189 # _csv.reader won't accept a quotechar of ''
190 dialect.quotechar = quotechar or '"'
191 dialect.skipinitialspace = skipinitialspace
192
193 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000194
195
Skip Montanaro77892372003-05-19 15:33:36 +0000196 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000197 """
198 Looks for text enclosed between two identical quotes
199 (the probable quotechar) which are preceded and followed
200 by the same character (the probable delimiter).
201 For example:
202 ,'some text',
203 The quote with the most wins, same with the delimiter.
204 If there is no quotechar the delimiter can't be determined
205 this way.
206 """
207
208 matches = []
209 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
210 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
211 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
212 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000213 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000214 matches = regexp.findall(data)
215 if matches:
216 break
217
218 if not matches:
Benjamin Peterson4ac9ce42009-10-04 14:49:41 +0000219 # (quotechar, doublequote, delimiter, skipinitialspace)
220 return ('', False, None, 0)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000221 quotes = {}
222 delims = {}
223 spaces = 0
224 for m in matches:
225 n = regexp.groupindex['quote'] - 1
226 key = m[n]
227 if key:
228 quotes[key] = quotes.get(key, 0) + 1
229 try:
230 n = regexp.groupindex['delim'] - 1
231 key = m[n]
232 except KeyError:
233 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000234 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000235 delims[key] = delims.get(key, 0) + 1
236 try:
237 n = regexp.groupindex['space'] - 1
238 except KeyError:
239 continue
240 if m[n]:
241 spaces += 1
242
Guido van Rossum89da5d72006-08-22 00:21:25 +0000243 quotechar = max(quotes, key=quotes.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000244
245 if delims:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000246 delim = max(delims, key=delims.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000247 skipinitialspace = delims[delim] == spaces
248 if delim == '\n': # most likely a file with a single column
249 delim = ''
250 else:
251 # there is *no* delimiter, it's a single column of quoted data
252 delim = ''
253 skipinitialspace = 0
254
Benjamin Peterson4ac9ce42009-10-04 14:49:41 +0000255 # if we see an extra quote between delimiters, we've got a
256 # double quoted format
257 dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
258 {'delim':delim, 'quote':quotechar}, re.MULTILINE)
259
260
261
262 if dq_regexp.search(data):
263 doublequote = True
264 else:
265 doublequote = False
266
267 return (quotechar, doublequote, delim, skipinitialspace)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000268
269
Skip Montanaro77892372003-05-19 15:33:36 +0000270 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000271 """
272 The delimiter /should/ occur the same number of times on
273 each row. However, due to malformed data, it may not. We don't want
274 an all or nothing approach, so we allow for small variations in this
275 number.
276 1) build a table of the frequency of each character on every line.
277 2) build a table of freqencies of this frequency (meta-frequency?),
278 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
279 7 times in 2 rows'
280 3) use the mode of the meta-frequency to determine the /expected/
281 frequency for that character
282 4) find out how often the character actually meets that goal
283 5) the character that best meets its goal is the delimiter
284 For performance reasons, the data is evaluated in chunks, so it can
285 try and evaluate the smallest portion of the data possible, evaluating
286 additional chunks as necessary.
287 """
288
Guido van Rossumc1f779c2007-07-03 08:25:58 +0000289 data = list(filter(None, data.split('\n')))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000290
291 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
292
293 # build frequency tables
294 chunkLength = min(10, len(data))
295 iteration = 0
296 charFrequency = {}
297 modes = {}
298 delims = {}
299 start, end = 0, min(chunkLength, len(data))
300 while start < len(data):
301 iteration += 1
302 for line in data[start:end]:
303 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000304 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000305 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000306 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000307 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000308 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
309 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000310
311 for char in charFrequency.keys():
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000312 items = list(charFrequency[char].items())
Skip Montanaro04ae7052003-04-24 20:21:31 +0000313 if len(items) == 1 and items[0][0] == 0:
314 continue
315 # get the mode of the frequencies
316 if len(items) > 1:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000317 modes[char] = max(items, key=lambda x: x[1])
Skip Montanaro04ae7052003-04-24 20:21:31 +0000318 # adjust the mode - subtract the sum of all
319 # other frequencies
320 items.remove(modes[char])
321 modes[char] = (modes[char][0], modes[char][1]
Guido van Rossum89da5d72006-08-22 00:21:25 +0000322 - sum(item[1] for item in items))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000323 else:
324 modes[char] = items[0]
325
326 # build a list of possible delimiters
327 modeList = modes.items()
328 total = float(chunkLength * iteration)
329 # (rows of consistent data) / (number of rows) = 100%
330 consistency = 1.0
331 # minimum consistency threshold
332 threshold = 0.9
333 while len(delims) == 0 and consistency >= threshold:
334 for k, v in modeList:
335 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000336 if ((v[1]/total) >= consistency and
337 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000338 delims[k] = v
339 consistency -= 0.01
340
341 if len(delims) == 1:
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000342 delim = list(delims.keys())[0]
Skip Montanaro04ae7052003-04-24 20:21:31 +0000343 skipinitialspace = (data[0].count(delim) ==
344 data[0].count("%c " % delim))
345 return (delim, skipinitialspace)
346
347 # analyze another chunkLength lines
348 start = end
349 end += chunkLength
350
351 if not delims:
352 return ('', 0)
353
354 # if there's more than one, fall back to a 'preferred' list
355 if len(delims) > 1:
356 for d in self.preferred:
357 if d in delims.keys():
358 skipinitialspace = (data[0].count(d) ==
359 data[0].count("%c " % d))
360 return (d, skipinitialspace)
361
Skip Montanaro39b29be2005-12-30 05:09:48 +0000362 # nothing else indicates a preference, pick the character that
363 # dominates(?)
364 items = [(v,k) for (k,v) in delims.items()]
365 items.sort()
366 delim = items[-1][1]
367
Skip Montanaro04ae7052003-04-24 20:21:31 +0000368 skipinitialspace = (data[0].count(delim) ==
369 data[0].count("%c " % delim))
370 return (delim, skipinitialspace)
371
372
Skip Montanaro1448d472003-04-25 14:47:16 +0000373 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000374 # Creates a dictionary of types of data in each column. If any
375 # column is of a single type (say, integers), *except* for the first
376 # row, then the first row is presumed to be labels. If the type
377 # can't be determined, it is assumed to be a string in which case
378 # the length of the string is the determining factor: if all of the
379 # rows except for the first are the same length, it's a header.
380 # Finally, a 'vote' is taken at the end for each column, adding or
381 # subtracting from the likelihood of the first row being a header.
382
Skip Montanaro1448d472003-04-25 14:47:16 +0000383 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000384
Georg Brandla18af4e2007-04-21 15:47:16 +0000385 header = next(rdr) # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000386
387 columns = len(header)
388 columnTypes = {}
389 for i in range(columns): columnTypes[i] = None
390
391 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000392 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000393 # arbitrary number of rows to check, to keep it sane
394 if checked > 20:
395 break
396 checked += 1
397
398 if len(row) != columns:
399 continue # skip rows that have irregular number of columns
400
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000401 for col in list(columnTypes.keys()):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000402
Amaury Forgeot d'Arca4618732008-04-24 18:26:53 +0000403 for thisType in [int, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000404 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000405 thisType(row[col])
406 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000407 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000408 pass
409 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000410 # fallback to length of string
411 thisType = len(row[col])
412
413 if thisType != columnTypes[col]:
414 if columnTypes[col] is None: # add new column type
415 columnTypes[col] = thisType
416 else:
417 # type is inconsistent, remove column from
418 # consideration
419 del columnTypes[col]
420
421 # finally, compare results against first row and "vote"
422 # on whether it's a header
423 hasHeader = 0
424 for col, colType in columnTypes.items():
425 if type(colType) == type(0): # it's a length
426 if len(header[col]) != colType:
427 hasHeader += 1
428 else:
429 hasHeader -= 1
430 else: # attempt typecast
431 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000432 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000433 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000434 hasHeader += 1
435 else:
436 hasHeader -= 1
437
438 return hasHeader > 0