blob: e0558c78d464e4f46f18c19abb3ad2c63038f43a [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Guido van Rossum68937b42007-05-18 00:51:22 +000014from io import StringIO
Skip Montanaro1448d472003-04-25 14:47:16 +000015
Skip Montanaro04ae7052003-04-24 20:21:31 +000016__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
17 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
18 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
19 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
20
21class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000022 """Describe an Excel dialect.
23
24 This must be subclassed (see csv.excel). Valid attributes are:
25 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
26 lineterminator, quoting.
27
28 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000029 _name = ""
30 _valid = False
31 # placeholders
32 delimiter = None
33 quotechar = None
34 escapechar = None
35 doublequote = None
36 skipinitialspace = None
37 lineterminator = None
38 quoting = None
39
40 def __init__(self):
41 if self.__class__ != Dialect:
42 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000043 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000044
45 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 try:
47 _Dialect(self)
Guido van Rossumb940e112007-01-10 16:19:56 +000048 except TypeError as e:
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 # We do this for compatibility with py2.3
50 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000051
52class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000053 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000054 delimiter = ','
55 quotechar = '"'
56 doublequote = True
57 skipinitialspace = False
58 lineterminator = '\r\n'
59 quoting = QUOTE_MINIMAL
60register_dialect("excel", excel)
61
62class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000063 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000064 delimiter = '\t'
65register_dialect("excel-tab", excel_tab)
66
67
68class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000069 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000070 dialect="excel", *args, **kwds):
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000071 self._fieldnames = fieldnames # list of keys for the dict
Skip Montanaro04ae7052003-04-24 20:21:31 +000072 self.restkey = restkey # key to catch long rows
73 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000074 self.reader = reader(f, dialect, *args, **kwds)
Christian Heimes4fbc72b2008-03-22 00:47:35 +000075 self.dialect = dialect
76 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000077
78 def __iter__(self):
79 return self
80
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000081 @property
82 def fieldnames(self):
83 if self._fieldnames is None:
84 try:
85 self._fieldnames = next(self.reader)
86 except StopIteration:
87 pass
88 self.line_num = self.reader.line_num
89 return self._fieldnames
90
91 @fieldnames.setter
92 def fieldnames(self, value):
93 self._fieldnames = value
94
Georg Brandla18af4e2007-04-21 15:47:16 +000095 def __next__(self):
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000096 if self.line_num == 0:
97 # Used only for its side effect.
98 self.fieldnames
Georg Brandla18af4e2007-04-21 15:47:16 +000099 row = next(self.reader)
Christian Heimes4fbc72b2008-03-22 00:47:35 +0000100 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +0000101
Skip Montanaro04ae7052003-04-24 20:21:31 +0000102 # unlike the basic reader, we prefer not to return blanks,
103 # because we will typically wind up with a dict full of None
104 # values
105 while row == []:
Georg Brandla18af4e2007-04-21 15:47:16 +0000106 row = next(self.reader)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000107 d = dict(zip(self.fieldnames, row))
108 lf = len(self.fieldnames)
109 lr = len(row)
110 if lf < lr:
111 d[self.restkey] = row[lf:]
112 elif lf > lr:
113 for key in self.fieldnames[lr:]:
114 d[key] = self.restval
115 return d
116
117
118class DictWriter:
119 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000120 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000121 self.fieldnames = fieldnames # list of keys for the dict
122 self.restval = restval # for writing short dicts
123 if extrasaction.lower() not in ("raise", "ignore"):
Collin Winterce36ad82007-08-30 01:19:48 +0000124 raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
125 % extrasaction)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000126 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000127 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000128
129 def _dict_to_list(self, rowdict):
130 if self.extrasaction == "raise":
Guido van Rossumd8faa362007-04-27 19:54:29 +0000131 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
132 if wrong_fields:
Collin Winterce36ad82007-08-30 01:19:48 +0000133 raise ValueError("dict contains fields not in fieldnames: "
134 + ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000135 return [rowdict.get(key, self.restval) for key in self.fieldnames]
136
137 def writerow(self, rowdict):
138 return self.writer.writerow(self._dict_to_list(rowdict))
139
140 def writerows(self, rowdicts):
141 rows = []
142 for rowdict in rowdicts:
143 rows.append(self._dict_to_list(rowdict))
144 return self.writer.writerows(rows)
145
Raymond Hettinger39a55922003-06-12 03:01:55 +0000146# Guard Sniffer's type checking against builds that exclude complex()
147try:
148 complex
149except NameError:
150 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000151
152class Sniffer:
153 '''
154 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000155 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000156 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000157 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000158 # in case there is more than one possible delimiter
159 self.preferred = [',', '\t', ';', ' ', ':']
160
Skip Montanaro04ae7052003-04-24 20:21:31 +0000161
Skip Montanaro77892372003-05-19 15:33:36 +0000162 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000164 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000165 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166
167 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000168 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000169 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000170 delimiter, skipinitialspace = self._guess_delimiter(sample,
171 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000172
Skip Montanaro39b29be2005-12-30 05:09:48 +0000173 if not delimiter:
Collin Winterce36ad82007-08-30 01:19:48 +0000174 raise Error("Could not determine delimiter")
Skip Montanaro39b29be2005-12-30 05:09:48 +0000175
Skip Montanaro1448d472003-04-25 14:47:16 +0000176 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000177 _name = "sniffed"
178 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000179 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000180 # escapechar = ''
181 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000182
Skip Montanaro1448d472003-04-25 14:47:16 +0000183 dialect.delimiter = delimiter
184 # _csv.reader won't accept a quotechar of ''
185 dialect.quotechar = quotechar or '"'
186 dialect.skipinitialspace = skipinitialspace
187
188 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000189
190
Skip Montanaro77892372003-05-19 15:33:36 +0000191 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000192 """
193 Looks for text enclosed between two identical quotes
194 (the probable quotechar) which are preceded and followed
195 by the same character (the probable delimiter).
196 For example:
197 ,'some text',
198 The quote with the most wins, same with the delimiter.
199 If there is no quotechar the delimiter can't be determined
200 this way.
201 """
202
203 matches = []
204 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
205 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
206 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
207 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000208 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000209 matches = regexp.findall(data)
210 if matches:
211 break
212
213 if not matches:
214 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
215
216 quotes = {}
217 delims = {}
218 spaces = 0
219 for m in matches:
220 n = regexp.groupindex['quote'] - 1
221 key = m[n]
222 if key:
223 quotes[key] = quotes.get(key, 0) + 1
224 try:
225 n = regexp.groupindex['delim'] - 1
226 key = m[n]
227 except KeyError:
228 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000229 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000230 delims[key] = delims.get(key, 0) + 1
231 try:
232 n = regexp.groupindex['space'] - 1
233 except KeyError:
234 continue
235 if m[n]:
236 spaces += 1
237
Guido van Rossum89da5d72006-08-22 00:21:25 +0000238 quotechar = max(quotes, key=quotes.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000239
240 if delims:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000241 delim = max(delims, key=delims.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000242 skipinitialspace = delims[delim] == spaces
243 if delim == '\n': # most likely a file with a single column
244 delim = ''
245 else:
246 # there is *no* delimiter, it's a single column of quoted data
247 delim = ''
248 skipinitialspace = 0
249
250 return (quotechar, delim, skipinitialspace)
251
252
Skip Montanaro77892372003-05-19 15:33:36 +0000253 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000254 """
255 The delimiter /should/ occur the same number of times on
256 each row. However, due to malformed data, it may not. We don't want
257 an all or nothing approach, so we allow for small variations in this
258 number.
259 1) build a table of the frequency of each character on every line.
260 2) build a table of freqencies of this frequency (meta-frequency?),
261 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
262 7 times in 2 rows'
263 3) use the mode of the meta-frequency to determine the /expected/
264 frequency for that character
265 4) find out how often the character actually meets that goal
266 5) the character that best meets its goal is the delimiter
267 For performance reasons, the data is evaluated in chunks, so it can
268 try and evaluate the smallest portion of the data possible, evaluating
269 additional chunks as necessary.
270 """
271
Guido van Rossumc1f779c2007-07-03 08:25:58 +0000272 data = list(filter(None, data.split('\n')))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000273
274 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
275
276 # build frequency tables
277 chunkLength = min(10, len(data))
278 iteration = 0
279 charFrequency = {}
280 modes = {}
281 delims = {}
282 start, end = 0, min(chunkLength, len(data))
283 while start < len(data):
284 iteration += 1
285 for line in data[start:end]:
286 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000287 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000288 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000289 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000290 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000291 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
292 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000293
294 for char in charFrequency.keys():
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000295 items = list(charFrequency[char].items())
Skip Montanaro04ae7052003-04-24 20:21:31 +0000296 if len(items) == 1 and items[0][0] == 0:
297 continue
298 # get the mode of the frequencies
299 if len(items) > 1:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000300 modes[char] = max(items, key=lambda x: x[1])
Skip Montanaro04ae7052003-04-24 20:21:31 +0000301 # adjust the mode - subtract the sum of all
302 # other frequencies
303 items.remove(modes[char])
304 modes[char] = (modes[char][0], modes[char][1]
Guido van Rossum89da5d72006-08-22 00:21:25 +0000305 - sum(item[1] for item in items))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000306 else:
307 modes[char] = items[0]
308
309 # build a list of possible delimiters
310 modeList = modes.items()
311 total = float(chunkLength * iteration)
312 # (rows of consistent data) / (number of rows) = 100%
313 consistency = 1.0
314 # minimum consistency threshold
315 threshold = 0.9
316 while len(delims) == 0 and consistency >= threshold:
317 for k, v in modeList:
318 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000319 if ((v[1]/total) >= consistency and
320 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000321 delims[k] = v
322 consistency -= 0.01
323
324 if len(delims) == 1:
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000325 delim = list(delims.keys())[0]
Skip Montanaro04ae7052003-04-24 20:21:31 +0000326 skipinitialspace = (data[0].count(delim) ==
327 data[0].count("%c " % delim))
328 return (delim, skipinitialspace)
329
330 # analyze another chunkLength lines
331 start = end
332 end += chunkLength
333
334 if not delims:
335 return ('', 0)
336
337 # if there's more than one, fall back to a 'preferred' list
338 if len(delims) > 1:
339 for d in self.preferred:
340 if d in delims.keys():
341 skipinitialspace = (data[0].count(d) ==
342 data[0].count("%c " % d))
343 return (d, skipinitialspace)
344
Skip Montanaro39b29be2005-12-30 05:09:48 +0000345 # nothing else indicates a preference, pick the character that
346 # dominates(?)
347 items = [(v,k) for (k,v) in delims.items()]
348 items.sort()
349 delim = items[-1][1]
350
Skip Montanaro04ae7052003-04-24 20:21:31 +0000351 skipinitialspace = (data[0].count(delim) ==
352 data[0].count("%c " % delim))
353 return (delim, skipinitialspace)
354
355
Skip Montanaro1448d472003-04-25 14:47:16 +0000356 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000357 # Creates a dictionary of types of data in each column. If any
358 # column is of a single type (say, integers), *except* for the first
359 # row, then the first row is presumed to be labels. If the type
360 # can't be determined, it is assumed to be a string in which case
361 # the length of the string is the determining factor: if all of the
362 # rows except for the first are the same length, it's a header.
363 # Finally, a 'vote' is taken at the end for each column, adding or
364 # subtracting from the likelihood of the first row being a header.
365
Skip Montanaro1448d472003-04-25 14:47:16 +0000366 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000367
Georg Brandla18af4e2007-04-21 15:47:16 +0000368 header = next(rdr) # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000369
370 columns = len(header)
371 columnTypes = {}
372 for i in range(columns): columnTypes[i] = None
373
374 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000375 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000376 # arbitrary number of rows to check, to keep it sane
377 if checked > 20:
378 break
379 checked += 1
380
381 if len(row) != columns:
382 continue # skip rows that have irregular number of columns
383
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000384 for col in list(columnTypes.keys()):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000385
Amaury Forgeot d'Arca4618732008-04-24 18:26:53 +0000386 for thisType in [int, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000387 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000388 thisType(row[col])
389 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000390 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000391 pass
392 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000393 # fallback to length of string
394 thisType = len(row[col])
395
396 if thisType != columnTypes[col]:
397 if columnTypes[col] is None: # add new column type
398 columnTypes[col] = thisType
399 else:
400 # type is inconsistent, remove column from
401 # consideration
402 del columnTypes[col]
403
404 # finally, compare results against first row and "vote"
405 # on whether it's a header
406 hasHeader = 0
407 for col, colType in columnTypes.items():
408 if type(colType) == type(0): # it's a length
409 if len(header[col]) != colType:
410 hasHeader += 1
411 else:
412 hasHeader -= 1
413 else: # attempt typecast
414 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000415 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000416 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000417 hasHeader += 1
418 else:
419 hasHeader -= 1
420
421 return hasHeader > 0