blob: 32a377e8f0486fe9ab39a787b6659a04468fb7d0 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Guido van Rossum68937b42007-05-18 00:51:22 +000014from io import StringIO
Skip Montanaro1448d472003-04-25 14:47:16 +000015
Skip Montanaro04ae7052003-04-24 20:21:31 +000016__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
17 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
18 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
19 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
20
21class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000022 """Describe an Excel dialect.
23
24 This must be subclassed (see csv.excel). Valid attributes are:
25 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
26 lineterminator, quoting.
27
28 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000029 _name = ""
30 _valid = False
31 # placeholders
32 delimiter = None
33 quotechar = None
34 escapechar = None
35 doublequote = None
36 skipinitialspace = None
37 lineterminator = None
38 quoting = None
39
40 def __init__(self):
41 if self.__class__ != Dialect:
42 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000043 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000044
45 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 try:
47 _Dialect(self)
Guido van Rossumb940e112007-01-10 16:19:56 +000048 except TypeError as e:
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 # We do this for compatibility with py2.3
50 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000051
52class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000053 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000054 delimiter = ','
55 quotechar = '"'
56 doublequote = True
57 skipinitialspace = False
58 lineterminator = '\r\n'
59 quoting = QUOTE_MINIMAL
60register_dialect("excel", excel)
61
62class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000063 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000064 delimiter = '\t'
65register_dialect("excel-tab", excel_tab)
66
67
68class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000069 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000070 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000071 self.fieldnames = fieldnames # list of keys for the dict
72 self.restkey = restkey # key to catch long rows
73 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000074 self.reader = reader(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +000075
76 def __iter__(self):
77 return self
78
Georg Brandla18af4e2007-04-21 15:47:16 +000079 def __next__(self):
80 row = next(self.reader)
Skip Montanarodffeed32003-10-03 14:03:01 +000081 if self.fieldnames is None:
82 self.fieldnames = row
Georg Brandla18af4e2007-04-21 15:47:16 +000083 row = next(self.reader)
Skip Montanarodffeed32003-10-03 14:03:01 +000084
Skip Montanaro04ae7052003-04-24 20:21:31 +000085 # unlike the basic reader, we prefer not to return blanks,
86 # because we will typically wind up with a dict full of None
87 # values
88 while row == []:
Georg Brandla18af4e2007-04-21 15:47:16 +000089 row = next(self.reader)
Skip Montanaro04ae7052003-04-24 20:21:31 +000090 d = dict(zip(self.fieldnames, row))
91 lf = len(self.fieldnames)
92 lr = len(row)
93 if lf < lr:
94 d[self.restkey] = row[lf:]
95 elif lf > lr:
96 for key in self.fieldnames[lr:]:
97 d[key] = self.restval
98 return d
99
100
101class DictWriter:
102 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000103 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000104 self.fieldnames = fieldnames # list of keys for the dict
105 self.restval = restval # for writing short dicts
106 if extrasaction.lower() not in ("raise", "ignore"):
Collin Winterce36ad82007-08-30 01:19:48 +0000107 raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
108 % extrasaction)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000109 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000110 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000111
112 def _dict_to_list(self, rowdict):
113 if self.extrasaction == "raise":
Guido van Rossumd8faa362007-04-27 19:54:29 +0000114 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
115 if wrong_fields:
Collin Winterce36ad82007-08-30 01:19:48 +0000116 raise ValueError("dict contains fields not in fieldnames: "
117 + ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000118 return [rowdict.get(key, self.restval) for key in self.fieldnames]
119
120 def writerow(self, rowdict):
121 return self.writer.writerow(self._dict_to_list(rowdict))
122
123 def writerows(self, rowdicts):
124 rows = []
125 for rowdict in rowdicts:
126 rows.append(self._dict_to_list(rowdict))
127 return self.writer.writerows(rows)
128
Raymond Hettinger39a55922003-06-12 03:01:55 +0000129# Guard Sniffer's type checking against builds that exclude complex()
130try:
131 complex
132except NameError:
133 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000134
135class Sniffer:
136 '''
137 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000138 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000139 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000140 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000141 # in case there is more than one possible delimiter
142 self.preferred = [',', '\t', ';', ' ', ':']
143
Skip Montanaro04ae7052003-04-24 20:21:31 +0000144
Skip Montanaro77892372003-05-19 15:33:36 +0000145 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000146 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000147 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000148 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000149
150 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000151 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000152 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000153 delimiter, skipinitialspace = self._guess_delimiter(sample,
154 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000155
Skip Montanaro39b29be2005-12-30 05:09:48 +0000156 if not delimiter:
Collin Winterce36ad82007-08-30 01:19:48 +0000157 raise Error("Could not determine delimiter")
Skip Montanaro39b29be2005-12-30 05:09:48 +0000158
Skip Montanaro1448d472003-04-25 14:47:16 +0000159 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000160 _name = "sniffed"
161 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000162 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000163 # escapechar = ''
164 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000165
Skip Montanaro1448d472003-04-25 14:47:16 +0000166 dialect.delimiter = delimiter
167 # _csv.reader won't accept a quotechar of ''
168 dialect.quotechar = quotechar or '"'
169 dialect.skipinitialspace = skipinitialspace
170
171 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000172
173
Skip Montanaro77892372003-05-19 15:33:36 +0000174 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000175 """
176 Looks for text enclosed between two identical quotes
177 (the probable quotechar) which are preceded and followed
178 by the same character (the probable delimiter).
179 For example:
180 ,'some text',
181 The quote with the most wins, same with the delimiter.
182 If there is no quotechar the delimiter can't be determined
183 this way.
184 """
185
186 matches = []
187 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
188 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
189 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
190 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000191 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000192 matches = regexp.findall(data)
193 if matches:
194 break
195
196 if not matches:
197 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
198
199 quotes = {}
200 delims = {}
201 spaces = 0
202 for m in matches:
203 n = regexp.groupindex['quote'] - 1
204 key = m[n]
205 if key:
206 quotes[key] = quotes.get(key, 0) + 1
207 try:
208 n = regexp.groupindex['delim'] - 1
209 key = m[n]
210 except KeyError:
211 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000212 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000213 delims[key] = delims.get(key, 0) + 1
214 try:
215 n = regexp.groupindex['space'] - 1
216 except KeyError:
217 continue
218 if m[n]:
219 spaces += 1
220
Guido van Rossum89da5d72006-08-22 00:21:25 +0000221 quotechar = max(quotes, key=quotes.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000222
223 if delims:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000224 delim = max(delims, key=delims.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000225 skipinitialspace = delims[delim] == spaces
226 if delim == '\n': # most likely a file with a single column
227 delim = ''
228 else:
229 # there is *no* delimiter, it's a single column of quoted data
230 delim = ''
231 skipinitialspace = 0
232
233 return (quotechar, delim, skipinitialspace)
234
235
Skip Montanaro77892372003-05-19 15:33:36 +0000236 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000237 """
238 The delimiter /should/ occur the same number of times on
239 each row. However, due to malformed data, it may not. We don't want
240 an all or nothing approach, so we allow for small variations in this
241 number.
242 1) build a table of the frequency of each character on every line.
243 2) build a table of freqencies of this frequency (meta-frequency?),
244 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
245 7 times in 2 rows'
246 3) use the mode of the meta-frequency to determine the /expected/
247 frequency for that character
248 4) find out how often the character actually meets that goal
249 5) the character that best meets its goal is the delimiter
250 For performance reasons, the data is evaluated in chunks, so it can
251 try and evaluate the smallest portion of the data possible, evaluating
252 additional chunks as necessary.
253 """
254
Guido van Rossumc1f779c2007-07-03 08:25:58 +0000255 data = list(filter(None, data.split('\n')))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000256
257 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
258
259 # build frequency tables
260 chunkLength = min(10, len(data))
261 iteration = 0
262 charFrequency = {}
263 modes = {}
264 delims = {}
265 start, end = 0, min(chunkLength, len(data))
266 while start < len(data):
267 iteration += 1
268 for line in data[start:end]:
269 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000270 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000271 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000272 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000273 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000274 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
275 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000276
277 for char in charFrequency.keys():
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000278 items = list(charFrequency[char].items())
Skip Montanaro04ae7052003-04-24 20:21:31 +0000279 if len(items) == 1 and items[0][0] == 0:
280 continue
281 # get the mode of the frequencies
282 if len(items) > 1:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000283 modes[char] = max(items, key=lambda x: x[1])
Skip Montanaro04ae7052003-04-24 20:21:31 +0000284 # adjust the mode - subtract the sum of all
285 # other frequencies
286 items.remove(modes[char])
287 modes[char] = (modes[char][0], modes[char][1]
Guido van Rossum89da5d72006-08-22 00:21:25 +0000288 - sum(item[1] for item in items))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000289 else:
290 modes[char] = items[0]
291
292 # build a list of possible delimiters
293 modeList = modes.items()
294 total = float(chunkLength * iteration)
295 # (rows of consistent data) / (number of rows) = 100%
296 consistency = 1.0
297 # minimum consistency threshold
298 threshold = 0.9
299 while len(delims) == 0 and consistency >= threshold:
300 for k, v in modeList:
301 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000302 if ((v[1]/total) >= consistency and
303 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000304 delims[k] = v
305 consistency -= 0.01
306
307 if len(delims) == 1:
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000308 delim = list(delims.keys())[0]
Skip Montanaro04ae7052003-04-24 20:21:31 +0000309 skipinitialspace = (data[0].count(delim) ==
310 data[0].count("%c " % delim))
311 return (delim, skipinitialspace)
312
313 # analyze another chunkLength lines
314 start = end
315 end += chunkLength
316
317 if not delims:
318 return ('', 0)
319
320 # if there's more than one, fall back to a 'preferred' list
321 if len(delims) > 1:
322 for d in self.preferred:
323 if d in delims.keys():
324 skipinitialspace = (data[0].count(d) ==
325 data[0].count("%c " % d))
326 return (d, skipinitialspace)
327
Skip Montanaro39b29be2005-12-30 05:09:48 +0000328 # nothing else indicates a preference, pick the character that
329 # dominates(?)
330 items = [(v,k) for (k,v) in delims.items()]
331 items.sort()
332 delim = items[-1][1]
333
Skip Montanaro04ae7052003-04-24 20:21:31 +0000334 skipinitialspace = (data[0].count(delim) ==
335 data[0].count("%c " % delim))
336 return (delim, skipinitialspace)
337
338
Skip Montanaro1448d472003-04-25 14:47:16 +0000339 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000340 # Creates a dictionary of types of data in each column. If any
341 # column is of a single type (say, integers), *except* for the first
342 # row, then the first row is presumed to be labels. If the type
343 # can't be determined, it is assumed to be a string in which case
344 # the length of the string is the determining factor: if all of the
345 # rows except for the first are the same length, it's a header.
346 # Finally, a 'vote' is taken at the end for each column, adding or
347 # subtracting from the likelihood of the first row being a header.
348
Skip Montanaro1448d472003-04-25 14:47:16 +0000349 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000350
Georg Brandla18af4e2007-04-21 15:47:16 +0000351 header = next(rdr) # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000352
353 columns = len(header)
354 columnTypes = {}
355 for i in range(columns): columnTypes[i] = None
356
357 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000358 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000359 # arbitrary number of rows to check, to keep it sane
360 if checked > 20:
361 break
362 checked += 1
363
364 if len(row) != columns:
365 continue # skip rows that have irregular number of columns
366
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000367 for col in list(columnTypes.keys()):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000368
Guido van Rossume2a383d2007-01-15 16:59:06 +0000369 for thisType in [int, int, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000370 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000371 thisType(row[col])
372 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000373 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000374 pass
375 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000376 # fallback to length of string
377 thisType = len(row[col])
378
Raymond Hettinger39a55922003-06-12 03:01:55 +0000379 # treat longs as ints
Guido van Rossume2a383d2007-01-15 16:59:06 +0000380 if thisType == int:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000381 thisType = int
382
Skip Montanaro04ae7052003-04-24 20:21:31 +0000383 if thisType != columnTypes[col]:
384 if columnTypes[col] is None: # add new column type
385 columnTypes[col] = thisType
386 else:
387 # type is inconsistent, remove column from
388 # consideration
389 del columnTypes[col]
390
391 # finally, compare results against first row and "vote"
392 # on whether it's a header
393 hasHeader = 0
394 for col, colType in columnTypes.items():
395 if type(colType) == type(0): # it's a length
396 if len(header[col]) != colType:
397 hasHeader += 1
398 else:
399 hasHeader -= 1
400 else: # attempt typecast
401 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000402 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000403 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000404 hasHeader += 1
405 else:
406 hasHeader -= 1
407
408 return hasHeader > 0