blob: ee62abe5dc6f159d07a9ee0f06c2010476d554af [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Guido van Rossum68937b42007-05-18 00:51:22 +000014from io import StringIO
Skip Montanaro1448d472003-04-25 14:47:16 +000015
Skip Montanaro04ae7052003-04-24 20:21:31 +000016__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
Brett Cannone8d0bf92008-08-18 00:51:19 +000017 "Error", "Dialect", "__doc__", "excel", "excel_tab",
18 "field_size_limit", "reader", "writer",
Skip Montanaro04ae7052003-04-24 20:21:31 +000019 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
21
22class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000023 """Describe an Excel dialect.
24
25 This must be subclassed (see csv.excel). Valid attributes are:
26 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
27 lineterminator, quoting.
28
29 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000030 _name = ""
31 _valid = False
32 # placeholders
33 delimiter = None
34 quotechar = None
35 escapechar = None
36 doublequote = None
37 skipinitialspace = None
38 lineterminator = None
39 quoting = None
40
41 def __init__(self):
42 if self.__class__ != Dialect:
43 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000044 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000045
46 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000047 try:
48 _Dialect(self)
Guido van Rossumb940e112007-01-10 16:19:56 +000049 except TypeError as e:
Andrew McNamara7130ff52005-01-11 02:22:47 +000050 # We do this for compatibility with py2.3
51 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000052
53class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000054 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000055 delimiter = ','
56 quotechar = '"'
57 doublequote = True
58 skipinitialspace = False
59 lineterminator = '\r\n'
60 quoting = QUOTE_MINIMAL
61register_dialect("excel", excel)
62
63class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000064 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000065 delimiter = '\t'
66register_dialect("excel-tab", excel_tab)
67
68
69class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000070 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000071 dialect="excel", *args, **kwds):
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000072 self._fieldnames = fieldnames # list of keys for the dict
Skip Montanaro04ae7052003-04-24 20:21:31 +000073 self.restkey = restkey # key to catch long rows
74 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000075 self.reader = reader(f, dialect, *args, **kwds)
Christian Heimes4fbc72b2008-03-22 00:47:35 +000076 self.dialect = dialect
77 self.line_num = 0
Skip Montanaro04ae7052003-04-24 20:21:31 +000078
79 def __iter__(self):
80 return self
81
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000082 @property
83 def fieldnames(self):
84 if self._fieldnames is None:
85 try:
86 self._fieldnames = next(self.reader)
87 except StopIteration:
88 pass
89 self.line_num = self.reader.line_num
90 return self._fieldnames
91
92 @fieldnames.setter
93 def fieldnames(self, value):
94 self._fieldnames = value
95
Georg Brandla18af4e2007-04-21 15:47:16 +000096 def __next__(self):
Skip Montanaroaf8fcfa2008-08-09 19:44:22 +000097 if self.line_num == 0:
98 # Used only for its side effect.
99 self.fieldnames
Georg Brandla18af4e2007-04-21 15:47:16 +0000100 row = next(self.reader)
Christian Heimes4fbc72b2008-03-22 00:47:35 +0000101 self.line_num = self.reader.line_num
Skip Montanarodffeed32003-10-03 14:03:01 +0000102
Skip Montanaro04ae7052003-04-24 20:21:31 +0000103 # unlike the basic reader, we prefer not to return blanks,
104 # because we will typically wind up with a dict full of None
105 # values
106 while row == []:
Georg Brandla18af4e2007-04-21 15:47:16 +0000107 row = next(self.reader)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000108 d = dict(zip(self.fieldnames, row))
109 lf = len(self.fieldnames)
110 lr = len(row)
111 if lf < lr:
112 d[self.restkey] = row[lf:]
113 elif lf > lr:
114 for key in self.fieldnames[lr:]:
115 d[key] = self.restval
116 return d
117
118
119class DictWriter:
120 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000121 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000122 self.fieldnames = fieldnames # list of keys for the dict
123 self.restval = restval # for writing short dicts
124 if extrasaction.lower() not in ("raise", "ignore"):
Collin Winterce36ad82007-08-30 01:19:48 +0000125 raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
126 % extrasaction)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000127 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000128 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000129
130 def _dict_to_list(self, rowdict):
131 if self.extrasaction == "raise":
Guido van Rossumd8faa362007-04-27 19:54:29 +0000132 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
133 if wrong_fields:
Collin Winterce36ad82007-08-30 01:19:48 +0000134 raise ValueError("dict contains fields not in fieldnames: "
135 + ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000136 return [rowdict.get(key, self.restval) for key in self.fieldnames]
137
138 def writerow(self, rowdict):
139 return self.writer.writerow(self._dict_to_list(rowdict))
140
141 def writerows(self, rowdicts):
142 rows = []
143 for rowdict in rowdicts:
144 rows.append(self._dict_to_list(rowdict))
145 return self.writer.writerows(rows)
146
Raymond Hettinger39a55922003-06-12 03:01:55 +0000147# Guard Sniffer's type checking against builds that exclude complex()
148try:
149 complex
150except NameError:
151 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000152
153class Sniffer:
154 '''
155 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000156 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000157 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000158 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000159 # in case there is more than one possible delimiter
160 self.preferred = [',', '\t', ';', ' ', ':']
161
Skip Montanaro04ae7052003-04-24 20:21:31 +0000162
Skip Montanaro77892372003-05-19 15:33:36 +0000163 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000164 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000165 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000167
168 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000169 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000170 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000171 delimiter, skipinitialspace = self._guess_delimiter(sample,
172 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000173
Skip Montanaro39b29be2005-12-30 05:09:48 +0000174 if not delimiter:
Collin Winterce36ad82007-08-30 01:19:48 +0000175 raise Error("Could not determine delimiter")
Skip Montanaro39b29be2005-12-30 05:09:48 +0000176
Skip Montanaro1448d472003-04-25 14:47:16 +0000177 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000178 _name = "sniffed"
179 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000180 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000181 # escapechar = ''
182 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000183
Skip Montanaro1448d472003-04-25 14:47:16 +0000184 dialect.delimiter = delimiter
185 # _csv.reader won't accept a quotechar of ''
186 dialect.quotechar = quotechar or '"'
187 dialect.skipinitialspace = skipinitialspace
188
189 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000190
191
Skip Montanaro77892372003-05-19 15:33:36 +0000192 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000193 """
194 Looks for text enclosed between two identical quotes
195 (the probable quotechar) which are preceded and followed
196 by the same character (the probable delimiter).
197 For example:
198 ,'some text',
199 The quote with the most wins, same with the delimiter.
200 If there is no quotechar the delimiter can't be determined
201 this way.
202 """
203
204 matches = []
205 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
206 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
207 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
208 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000209 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000210 matches = regexp.findall(data)
211 if matches:
212 break
213
214 if not matches:
215 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
216
217 quotes = {}
218 delims = {}
219 spaces = 0
220 for m in matches:
221 n = regexp.groupindex['quote'] - 1
222 key = m[n]
223 if key:
224 quotes[key] = quotes.get(key, 0) + 1
225 try:
226 n = regexp.groupindex['delim'] - 1
227 key = m[n]
228 except KeyError:
229 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000230 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000231 delims[key] = delims.get(key, 0) + 1
232 try:
233 n = regexp.groupindex['space'] - 1
234 except KeyError:
235 continue
236 if m[n]:
237 spaces += 1
238
Guido van Rossum89da5d72006-08-22 00:21:25 +0000239 quotechar = max(quotes, key=quotes.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000240
241 if delims:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000242 delim = max(delims, key=delims.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000243 skipinitialspace = delims[delim] == spaces
244 if delim == '\n': # most likely a file with a single column
245 delim = ''
246 else:
247 # there is *no* delimiter, it's a single column of quoted data
248 delim = ''
249 skipinitialspace = 0
250
251 return (quotechar, delim, skipinitialspace)
252
253
Skip Montanaro77892372003-05-19 15:33:36 +0000254 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000255 """
256 The delimiter /should/ occur the same number of times on
257 each row. However, due to malformed data, it may not. We don't want
258 an all or nothing approach, so we allow for small variations in this
259 number.
260 1) build a table of the frequency of each character on every line.
261 2) build a table of freqencies of this frequency (meta-frequency?),
262 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
263 7 times in 2 rows'
264 3) use the mode of the meta-frequency to determine the /expected/
265 frequency for that character
266 4) find out how often the character actually meets that goal
267 5) the character that best meets its goal is the delimiter
268 For performance reasons, the data is evaluated in chunks, so it can
269 try and evaluate the smallest portion of the data possible, evaluating
270 additional chunks as necessary.
271 """
272
Guido van Rossumc1f779c2007-07-03 08:25:58 +0000273 data = list(filter(None, data.split('\n')))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000274
275 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
276
277 # build frequency tables
278 chunkLength = min(10, len(data))
279 iteration = 0
280 charFrequency = {}
281 modes = {}
282 delims = {}
283 start, end = 0, min(chunkLength, len(data))
284 while start < len(data):
285 iteration += 1
286 for line in data[start:end]:
287 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000288 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000289 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000290 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000291 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000292 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
293 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000294
295 for char in charFrequency.keys():
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000296 items = list(charFrequency[char].items())
Skip Montanaro04ae7052003-04-24 20:21:31 +0000297 if len(items) == 1 and items[0][0] == 0:
298 continue
299 # get the mode of the frequencies
300 if len(items) > 1:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000301 modes[char] = max(items, key=lambda x: x[1])
Skip Montanaro04ae7052003-04-24 20:21:31 +0000302 # adjust the mode - subtract the sum of all
303 # other frequencies
304 items.remove(modes[char])
305 modes[char] = (modes[char][0], modes[char][1]
Guido van Rossum89da5d72006-08-22 00:21:25 +0000306 - sum(item[1] for item in items))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000307 else:
308 modes[char] = items[0]
309
310 # build a list of possible delimiters
311 modeList = modes.items()
312 total = float(chunkLength * iteration)
313 # (rows of consistent data) / (number of rows) = 100%
314 consistency = 1.0
315 # minimum consistency threshold
316 threshold = 0.9
317 while len(delims) == 0 and consistency >= threshold:
318 for k, v in modeList:
319 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000320 if ((v[1]/total) >= consistency and
321 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000322 delims[k] = v
323 consistency -= 0.01
324
325 if len(delims) == 1:
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000326 delim = list(delims.keys())[0]
Skip Montanaro04ae7052003-04-24 20:21:31 +0000327 skipinitialspace = (data[0].count(delim) ==
328 data[0].count("%c " % delim))
329 return (delim, skipinitialspace)
330
331 # analyze another chunkLength lines
332 start = end
333 end += chunkLength
334
335 if not delims:
336 return ('', 0)
337
338 # if there's more than one, fall back to a 'preferred' list
339 if len(delims) > 1:
340 for d in self.preferred:
341 if d in delims.keys():
342 skipinitialspace = (data[0].count(d) ==
343 data[0].count("%c " % d))
344 return (d, skipinitialspace)
345
Skip Montanaro39b29be2005-12-30 05:09:48 +0000346 # nothing else indicates a preference, pick the character that
347 # dominates(?)
348 items = [(v,k) for (k,v) in delims.items()]
349 items.sort()
350 delim = items[-1][1]
351
Skip Montanaro04ae7052003-04-24 20:21:31 +0000352 skipinitialspace = (data[0].count(delim) ==
353 data[0].count("%c " % delim))
354 return (delim, skipinitialspace)
355
356
Skip Montanaro1448d472003-04-25 14:47:16 +0000357 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000358 # Creates a dictionary of types of data in each column. If any
359 # column is of a single type (say, integers), *except* for the first
360 # row, then the first row is presumed to be labels. If the type
361 # can't be determined, it is assumed to be a string in which case
362 # the length of the string is the determining factor: if all of the
363 # rows except for the first are the same length, it's a header.
364 # Finally, a 'vote' is taken at the end for each column, adding or
365 # subtracting from the likelihood of the first row being a header.
366
Skip Montanaro1448d472003-04-25 14:47:16 +0000367 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000368
Georg Brandla18af4e2007-04-21 15:47:16 +0000369 header = next(rdr) # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000370
371 columns = len(header)
372 columnTypes = {}
373 for i in range(columns): columnTypes[i] = None
374
375 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000376 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000377 # arbitrary number of rows to check, to keep it sane
378 if checked > 20:
379 break
380 checked += 1
381
382 if len(row) != columns:
383 continue # skip rows that have irregular number of columns
384
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000385 for col in list(columnTypes.keys()):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000386
Amaury Forgeot d'Arca4618732008-04-24 18:26:53 +0000387 for thisType in [int, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000388 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000389 thisType(row[col])
390 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000391 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000392 pass
393 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000394 # fallback to length of string
395 thisType = len(row[col])
396
397 if thisType != columnTypes[col]:
398 if columnTypes[col] is None: # add new column type
399 columnTypes[col] = thisType
400 else:
401 # type is inconsistent, remove column from
402 # consideration
403 del columnTypes[col]
404
405 # finally, compare results against first row and "vote"
406 # on whether it's a header
407 hasHeader = 0
408 for col, colType in columnTypes.items():
409 if type(colType) == type(0): # it's a length
410 if len(header[col]) != colType:
411 hasHeader += 1
412 else:
413 hasHeader -= 1
414 else: # attempt typecast
415 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000416 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000417 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000418 hasHeader += 1
419 else:
420 hasHeader -= 1
421
422 return hasHeader > 0