blob: dbe6db7eadb49bf6c5780a4fbbed9ce70c9146b5 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Guido van Rossum68937b42007-05-18 00:51:22 +000014from io import StringIO
Skip Montanaro1448d472003-04-25 14:47:16 +000015
Skip Montanaro04ae7052003-04-24 20:21:31 +000016__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
Brett Cannone8d0bf92008-08-18 00:51:19 +000017 "Error", "Dialect", "__doc__", "excel", "excel_tab",
18 "field_size_limit", "reader", "writer",
Skip Montanaro04ae7052003-04-24 20:21:31 +000019 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
21
22class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000023 """Describe an Excel dialect.
24
25 This must be subclassed (see csv.excel). Valid attributes are:
26 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
27 lineterminator, quoting.
28
29 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000030 _name = ""
31 _valid = False
32 # placeholders
33 delimiter = None
34 quotechar = None
35 escapechar = None
36 doublequote = None
37 skipinitialspace = None
38 lineterminator = None
39 quoting = None
40
41 def __init__(self):
42 if self.__class__ != Dialect:
43 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000044 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000045
46 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000047 try:
48 _Dialect(self)
Guido van Rossumb940e112007-01-10 16:19:56 +000049 except TypeError as e:
Andrew McNamara7130ff52005-01-11 02:22:47 +000050 # We do this for compatibility with py2.3
51 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000052
53class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000054 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000055 delimiter = ','
56 quotechar = '"'
57 doublequote = True
58 skipinitialspace = False
59 lineterminator = '\r\n'
60 quoting = QUOTE_MINIMAL
61register_dialect("excel", excel)
62
63class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000064 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000065 delimiter = '\t'
66register_dialect("excel-tab", excel_tab)
67
68
69class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000070 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000071 dialect="excel", *args, **kwds):
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000072 self._fieldnames = fieldnames # list of keys for the dict
Skip Montanaro04ae7052003-04-24 20:21:31 +000073 self.restkey = restkey # key to catch long rows
74 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000075 self.reader = reader(f, dialect, *args, **kwds)
Christian Heimes4fbc72b2008-03-22 00:47:35 +000076 self.dialect = dialect
77 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000078
79 def __iter__(self):
80 return self
81
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000082 @property
83 def fieldnames(self):
84 if self._fieldnames is None:
85 try:
86 self._fieldnames = next(self.reader)
87 except StopIteration:
88 pass
89 self.line_num = self.reader.line_num
90 return self._fieldnames
91
92 @fieldnames.setter
93 def fieldnames(self, value):
94 self._fieldnames = value
95
Georg Brandla18af4e2007-04-21 15:47:16 +000096 def __next__(self):
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000097 if self.line_num == 0:
98 # Used only for its side effect.
99 self.fieldnames
Georg Brandla18af4e2007-04-21 15:47:16 +0000100 row = next(self.reader)
Christian Heimes4fbc72b2008-03-22 00:47:35 +0000101 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +0000102
Skip Montanaro04ae7052003-04-24 20:21:31 +0000103 # unlike the basic reader, we prefer not to return blanks,
104 # because we will typically wind up with a dict full of None
105 # values
106 while row == []:
Georg Brandla18af4e2007-04-21 15:47:16 +0000107 row = next(self.reader)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000108 d = dict(zip(self.fieldnames, row))
109 lf = len(self.fieldnames)
110 lr = len(row)
111 if lf < lr:
112 d[self.restkey] = row[lf:]
113 elif lf > lr:
114 for key in self.fieldnames[lr:]:
115 d[key] = self.restval
116 return d
117
118
119class DictWriter:
120 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000121 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000122 self.fieldnames = fieldnames # list of keys for the dict
123 self.restval = restval # for writing short dicts
124 if extrasaction.lower() not in ("raise", "ignore"):
Collin Winterce36ad82007-08-30 01:19:48 +0000125 raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
126 % extrasaction)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000127 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000128 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000129
130 def _dict_to_list(self, rowdict):
131 if self.extrasaction == "raise":
Guido van Rossumd8faa362007-04-27 19:54:29 +0000132 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
133 if wrong_fields:
Collin Winterce36ad82007-08-30 01:19:48 +0000134 raise ValueError("dict contains fields not in fieldnames: "
135 + ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000136 return [rowdict.get(key, self.restval) for key in self.fieldnames]
137
138 def writerow(self, rowdict):
139 return self.writer.writerow(self._dict_to_list(rowdict))
140
141 def writerows(self, rowdicts):
142 rows = []
143 for rowdict in rowdicts:
144 rows.append(self._dict_to_list(rowdict))
145 return self.writer.writerows(rows)
146
Raymond Hettinger39a55922003-06-12 03:01:55 +0000147# Guard Sniffer's type checking against builds that exclude complex()
148try:
149 complex
150except NameError:
151 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000152
153class Sniffer:
154 '''
155 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000156 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000157 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000158 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000159 # in case there is more than one possible delimiter
160 self.preferred = [',', '\t', ';', ' ', ':']
161
Skip Montanaro04ae7052003-04-24 20:21:31 +0000162
Skip Montanaro77892372003-05-19 15:33:36 +0000163 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000164 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000165 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000167
Benjamin Petersonf3d7dbe2009-10-04 14:54:52 +0000168 quotechar, doublequote, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000169 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000170 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000171 delimiter, skipinitialspace = self._guess_delimiter(sample,
172 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000173
Skip Montanaro39b29be2005-12-30 05:09:48 +0000174 if not delimiter:
Collin Winterce36ad82007-08-30 01:19:48 +0000175 raise Error("Could not determine delimiter")
Skip Montanaro39b29be2005-12-30 05:09:48 +0000176
Skip Montanaro1448d472003-04-25 14:47:16 +0000177 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000178 _name = "sniffed"
179 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000180 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000181 # escapechar = ''
Skip Montanaro04ae7052003-04-24 20:21:31 +0000182
Benjamin Petersonf3d7dbe2009-10-04 14:54:52 +0000183 dialect.doublequote = doublequote
Skip Montanaro1448d472003-04-25 14:47:16 +0000184 dialect.delimiter = delimiter
185 # _csv.reader won't accept a quotechar of ''
186 dialect.quotechar = quotechar or '"'
187 dialect.skipinitialspace = skipinitialspace
188
189 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000190
191
Skip Montanaro77892372003-05-19 15:33:36 +0000192 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000193 """
194 Looks for text enclosed between two identical quotes
195 (the probable quotechar) which are preceded and followed
196 by the same character (the probable delimiter).
197 For example:
198 ,'some text',
199 The quote with the most wins, same with the delimiter.
200 If there is no quotechar the delimiter can't be determined
201 this way.
202 """
203
204 matches = []
205 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
206 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
207 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
208 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000209 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000210 matches = regexp.findall(data)
211 if matches:
212 break
213
214 if not matches:
Benjamin Petersonf3d7dbe2009-10-04 14:54:52 +0000215 # (quotechar, doublequote, delimiter, skipinitialspace)
216 return ('', False, None, 0)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000217 quotes = {}
218 delims = {}
219 spaces = 0
220 for m in matches:
221 n = regexp.groupindex['quote'] - 1
222 key = m[n]
223 if key:
224 quotes[key] = quotes.get(key, 0) + 1
225 try:
226 n = regexp.groupindex['delim'] - 1
227 key = m[n]
228 except KeyError:
229 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000230 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000231 delims[key] = delims.get(key, 0) + 1
232 try:
233 n = regexp.groupindex['space'] - 1
234 except KeyError:
235 continue
236 if m[n]:
237 spaces += 1
238
Guido van Rossum89da5d72006-08-22 00:21:25 +0000239 quotechar = max(quotes, key=quotes.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000240
241 if delims:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000242 delim = max(delims, key=delims.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000243 skipinitialspace = delims[delim] == spaces
244 if delim == '\n': # most likely a file with a single column
245 delim = ''
246 else:
247 # there is *no* delimiter, it's a single column of quoted data
248 delim = ''
249 skipinitialspace = 0
250
Benjamin Petersonf3d7dbe2009-10-04 14:54:52 +0000251 # if we see an extra quote between delimiters, we've got a
252 # double quoted format
253 dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
254 {'delim':delim, 'quote':quotechar}, re.MULTILINE)
255
256
257
258 if dq_regexp.search(data):
259 doublequote = True
260 else:
261 doublequote = False
262
263 return (quotechar, doublequote, delim, skipinitialspace)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000264
265
Skip Montanaro77892372003-05-19 15:33:36 +0000266 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000267 """
268 The delimiter /should/ occur the same number of times on
269 each row. However, due to malformed data, it may not. We don't want
270 an all or nothing approach, so we allow for small variations in this
271 number.
272 1) build a table of the frequency of each character on every line.
Ezio Melotti13925002011-03-16 11:05:33 +0200273 2) build a table of frequencies of this frequency (meta-frequency?),
Skip Montanaro04ae7052003-04-24 20:21:31 +0000274 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
275 7 times in 2 rows'
276 3) use the mode of the meta-frequency to determine the /expected/
277 frequency for that character
278 4) find out how often the character actually meets that goal
279 5) the character that best meets its goal is the delimiter
280 For performance reasons, the data is evaluated in chunks, so it can
281 try and evaluate the smallest portion of the data possible, evaluating
282 additional chunks as necessary.
283 """
284
Guido van Rossumc1f779c2007-07-03 08:25:58 +0000285 data = list(filter(None, data.split('\n')))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000286
287 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
288
289 # build frequency tables
290 chunkLength = min(10, len(data))
291 iteration = 0
292 charFrequency = {}
293 modes = {}
294 delims = {}
295 start, end = 0, min(chunkLength, len(data))
296 while start < len(data):
297 iteration += 1
298 for line in data[start:end]:
299 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000300 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000301 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000302 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000303 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000304 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
305 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000306
307 for char in charFrequency.keys():
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000308 items = list(charFrequency[char].items())
Skip Montanaro04ae7052003-04-24 20:21:31 +0000309 if len(items) == 1 and items[0][0] == 0:
310 continue
311 # get the mode of the frequencies
312 if len(items) > 1:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000313 modes[char] = max(items, key=lambda x: x[1])
Skip Montanaro04ae7052003-04-24 20:21:31 +0000314 # adjust the mode - subtract the sum of all
315 # other frequencies
316 items.remove(modes[char])
317 modes[char] = (modes[char][0], modes[char][1]
Guido van Rossum89da5d72006-08-22 00:21:25 +0000318 - sum(item[1] for item in items))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000319 else:
320 modes[char] = items[0]
321
322 # build a list of possible delimiters
323 modeList = modes.items()
324 total = float(chunkLength * iteration)
325 # (rows of consistent data) / (number of rows) = 100%
326 consistency = 1.0
327 # minimum consistency threshold
328 threshold = 0.9
329 while len(delims) == 0 and consistency >= threshold:
330 for k, v in modeList:
331 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000332 if ((v[1]/total) >= consistency and
333 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000334 delims[k] = v
335 consistency -= 0.01
336
337 if len(delims) == 1:
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000338 delim = list(delims.keys())[0]
Skip Montanaro04ae7052003-04-24 20:21:31 +0000339 skipinitialspace = (data[0].count(delim) ==
340 data[0].count("%c " % delim))
341 return (delim, skipinitialspace)
342
343 # analyze another chunkLength lines
344 start = end
345 end += chunkLength
346
347 if not delims:
348 return ('', 0)
349
350 # if there's more than one, fall back to a 'preferred' list
351 if len(delims) > 1:
352 for d in self.preferred:
353 if d in delims.keys():
354 skipinitialspace = (data[0].count(d) ==
355 data[0].count("%c " % d))
356 return (d, skipinitialspace)
357
Skip Montanaro39b29be2005-12-30 05:09:48 +0000358 # nothing else indicates a preference, pick the character that
359 # dominates(?)
360 items = [(v,k) for (k,v) in delims.items()]
361 items.sort()
362 delim = items[-1][1]
363
Skip Montanaro04ae7052003-04-24 20:21:31 +0000364 skipinitialspace = (data[0].count(delim) ==
365 data[0].count("%c " % delim))
366 return (delim, skipinitialspace)
367
368
Skip Montanaro1448d472003-04-25 14:47:16 +0000369 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000370 # Creates a dictionary of types of data in each column. If any
371 # column is of a single type (say, integers), *except* for the first
372 # row, then the first row is presumed to be labels. If the type
373 # can't be determined, it is assumed to be a string in which case
374 # the length of the string is the determining factor: if all of the
375 # rows except for the first are the same length, it's a header.
376 # Finally, a 'vote' is taken at the end for each column, adding or
377 # subtracting from the likelihood of the first row being a header.
378
Skip Montanaro1448d472003-04-25 14:47:16 +0000379 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000380
Georg Brandla18af4e2007-04-21 15:47:16 +0000381 header = next(rdr) # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000382
383 columns = len(header)
384 columnTypes = {}
385 for i in range(columns): columnTypes[i] = None
386
387 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000388 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000389 # arbitrary number of rows to check, to keep it sane
390 if checked > 20:
391 break
392 checked += 1
393
394 if len(row) != columns:
395 continue # skip rows that have irregular number of columns
396
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000397 for col in list(columnTypes.keys()):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000398
Amaury Forgeot d'Arca4618732008-04-24 18:26:53 +0000399 for thisType in [int, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000400 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000401 thisType(row[col])
402 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000403 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000404 pass
405 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000406 # fallback to length of string
407 thisType = len(row[col])
408
409 if thisType != columnTypes[col]:
410 if columnTypes[col] is None: # add new column type
411 columnTypes[col] = thisType
412 else:
413 # type is inconsistent, remove column from
414 # consideration
415 del columnTypes[col]
416
417 # finally, compare results against first row and "vote"
418 # on whether it's a header
419 hasHeader = 0
420 for col, colType in columnTypes.items():
421 if type(colType) == type(0): # it's a length
422 if len(header[col]) != colType:
423 hasHeader += 1
424 else:
425 hasHeader -= 1
426 else: # attempt typecast
427 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000428 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000429 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000430 hasHeader += 1
431 else:
432 hasHeader -= 1
433
434 return hasHeader > 0