blob: 758c79a4d88bc00129d142320cb4318a6197d663 [file] [log] [blame]
Skip Montanaro04ae7052003-04-24 20:21:31 +00001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
Andrew McNamara31d88962005-01-12 03:45:10 +00009 field_size_limit, \
Skip Montanaro04ae7052003-04-24 20:21:31 +000010 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
Andrew McNamara7130ff52005-01-11 02:22:47 +000012from _csv import Dialect as _Dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +000013
Guido van Rossum68937b42007-05-18 00:51:22 +000014from io import StringIO
Skip Montanaro1448d472003-04-25 14:47:16 +000015
Skip Montanaro04ae7052003-04-24 20:21:31 +000016__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
17 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
18 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
19 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
20
21class Dialect:
Skip Montanarof26285c2005-01-05 06:54:58 +000022 """Describe an Excel dialect.
23
24 This must be subclassed (see csv.excel). Valid attributes are:
25 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
26 lineterminator, quoting.
27
28 """
Skip Montanaro04ae7052003-04-24 20:21:31 +000029 _name = ""
30 _valid = False
31 # placeholders
32 delimiter = None
33 quotechar = None
34 escapechar = None
35 doublequote = None
36 skipinitialspace = None
37 lineterminator = None
38 quoting = None
39
40 def __init__(self):
41 if self.__class__ != Dialect:
42 self._valid = True
Andrew McNamara7130ff52005-01-11 02:22:47 +000043 self._validate()
Skip Montanaro04ae7052003-04-24 20:21:31 +000044
45 def _validate(self):
Andrew McNamara7130ff52005-01-11 02:22:47 +000046 try:
47 _Dialect(self)
Guido van Rossumb940e112007-01-10 16:19:56 +000048 except TypeError as e:
Andrew McNamara7130ff52005-01-11 02:22:47 +000049 # We do this for compatibility with py2.3
50 raise Error(str(e))
Skip Montanaro04ae7052003-04-24 20:21:31 +000051
52class excel(Dialect):
Skip Montanarof26285c2005-01-05 06:54:58 +000053 """Describe the usual properties of Excel-generated CSV files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000054 delimiter = ','
55 quotechar = '"'
56 doublequote = True
57 skipinitialspace = False
58 lineterminator = '\r\n'
59 quoting = QUOTE_MINIMAL
60register_dialect("excel", excel)
61
62class excel_tab(excel):
Skip Montanarof26285c2005-01-05 06:54:58 +000063 """Describe the usual properties of Excel-generated TAB-delimited files."""
Skip Montanaro04ae7052003-04-24 20:21:31 +000064 delimiter = '\t'
65register_dialect("excel-tab", excel_tab)
66
67
68class DictReader:
Skip Montanarodffeed32003-10-03 14:03:01 +000069 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
Skip Montanaro3f7a9482003-09-06 19:52:12 +000070 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +000071 self.fieldnames = fieldnames # list of keys for the dict
72 self.restkey = restkey # key to catch long rows
73 self.restval = restval # default value for short rows
Skip Montanaro3f7a9482003-09-06 19:52:12 +000074 self.reader = reader(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +000075
76 def __iter__(self):
77 return self
78
Georg Brandla18af4e2007-04-21 15:47:16 +000079 def __next__(self):
80 row = next(self.reader)
Skip Montanarodffeed32003-10-03 14:03:01 +000081 if self.fieldnames is None:
82 self.fieldnames = row
Georg Brandla18af4e2007-04-21 15:47:16 +000083 row = next(self.reader)
Skip Montanarodffeed32003-10-03 14:03:01 +000084
Skip Montanaro04ae7052003-04-24 20:21:31 +000085 # unlike the basic reader, we prefer not to return blanks,
86 # because we will typically wind up with a dict full of None
87 # values
88 while row == []:
Georg Brandla18af4e2007-04-21 15:47:16 +000089 row = next(self.reader)
Skip Montanaro04ae7052003-04-24 20:21:31 +000090 d = dict(zip(self.fieldnames, row))
91 lf = len(self.fieldnames)
92 lr = len(row)
93 if lf < lr:
94 d[self.restkey] = row[lf:]
95 elif lf > lr:
96 for key in self.fieldnames[lr:]:
97 d[key] = self.restval
98 return d
99
100
101class DictWriter:
102 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000103 dialect="excel", *args, **kwds):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000104 self.fieldnames = fieldnames # list of keys for the dict
105 self.restval = restval # for writing short dicts
106 if extrasaction.lower() not in ("raise", "ignore"):
107 raise ValueError, \
108 ("extrasaction (%s) must be 'raise' or 'ignore'" %
109 extrasaction)
110 self.extrasaction = extrasaction
Skip Montanaro3f7a9482003-09-06 19:52:12 +0000111 self.writer = writer(f, dialect, *args, **kwds)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000112
113 def _dict_to_list(self, rowdict):
114 if self.extrasaction == "raise":
Guido van Rossumd8faa362007-04-27 19:54:29 +0000115 wrong_fields = [k for k in rowdict if k not in self.fieldnames]
116 if wrong_fields:
117 raise ValueError("dict contains fields not in fieldnames: " +
118 ", ".join(wrong_fields))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000119 return [rowdict.get(key, self.restval) for key in self.fieldnames]
120
121 def writerow(self, rowdict):
122 return self.writer.writerow(self._dict_to_list(rowdict))
123
124 def writerows(self, rowdicts):
125 rows = []
126 for rowdict in rowdicts:
127 rows.append(self._dict_to_list(rowdict))
128 return self.writer.writerows(rows)
129
Raymond Hettinger39a55922003-06-12 03:01:55 +0000130# Guard Sniffer's type checking against builds that exclude complex()
131try:
132 complex
133except NameError:
134 complex = float
Skip Montanaro04ae7052003-04-24 20:21:31 +0000135
136class Sniffer:
137 '''
138 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Skip Montanaro1448d472003-04-25 14:47:16 +0000139 Returns a Dialect object.
Skip Montanaro04ae7052003-04-24 20:21:31 +0000140 '''
Skip Montanaro1448d472003-04-25 14:47:16 +0000141 def __init__(self):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000142 # in case there is more than one possible delimiter
143 self.preferred = [',', '\t', ';', ' ', ':']
144
Skip Montanaro04ae7052003-04-24 20:21:31 +0000145
Skip Montanaro77892372003-05-19 15:33:36 +0000146 def sniff(self, sample, delimiters=None):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000147 """
Skip Montanaro1448d472003-04-25 14:47:16 +0000148 Returns a dialect (or None) corresponding to the sample
Skip Montanaro04ae7052003-04-24 20:21:31 +0000149 """
Skip Montanaro04ae7052003-04-24 20:21:31 +0000150
151 quotechar, delimiter, skipinitialspace = \
Skip Montanaro77892372003-05-19 15:33:36 +0000152 self._guess_quote_and_delimiter(sample, delimiters)
Skip Montanaro39b29be2005-12-30 05:09:48 +0000153 if not delimiter:
Skip Montanaro77892372003-05-19 15:33:36 +0000154 delimiter, skipinitialspace = self._guess_delimiter(sample,
155 delimiters)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000156
Skip Montanaro39b29be2005-12-30 05:09:48 +0000157 if not delimiter:
158 raise Error, "Could not determine delimiter"
159
Skip Montanaro1448d472003-04-25 14:47:16 +0000160 class dialect(Dialect):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000161 _name = "sniffed"
162 lineterminator = '\r\n'
Fred Drake7c852f32003-04-25 14:27:00 +0000163 quoting = QUOTE_MINIMAL
Skip Montanaro04ae7052003-04-24 20:21:31 +0000164 # escapechar = ''
165 doublequote = False
Skip Montanaro04ae7052003-04-24 20:21:31 +0000166
Skip Montanaro1448d472003-04-25 14:47:16 +0000167 dialect.delimiter = delimiter
168 # _csv.reader won't accept a quotechar of ''
169 dialect.quotechar = quotechar or '"'
170 dialect.skipinitialspace = skipinitialspace
171
172 return dialect
Skip Montanaro04ae7052003-04-24 20:21:31 +0000173
174
Skip Montanaro77892372003-05-19 15:33:36 +0000175 def _guess_quote_and_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000176 """
177 Looks for text enclosed between two identical quotes
178 (the probable quotechar) which are preceded and followed
179 by the same character (the probable delimiter).
180 For example:
181 ,'some text',
182 The quote with the most wins, same with the delimiter.
183 If there is no quotechar the delimiter can't be determined
184 this way.
185 """
186
187 matches = []
188 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
189 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
190 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
191 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
Fred Drake6f7b2132003-09-02 16:01:07 +0000192 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000193 matches = regexp.findall(data)
194 if matches:
195 break
196
197 if not matches:
198 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
199
200 quotes = {}
201 delims = {}
202 spaces = 0
203 for m in matches:
204 n = regexp.groupindex['quote'] - 1
205 key = m[n]
206 if key:
207 quotes[key] = quotes.get(key, 0) + 1
208 try:
209 n = regexp.groupindex['delim'] - 1
210 key = m[n]
211 except KeyError:
212 continue
Skip Montanaro77892372003-05-19 15:33:36 +0000213 if key and (delimiters is None or key in delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000214 delims[key] = delims.get(key, 0) + 1
215 try:
216 n = regexp.groupindex['space'] - 1
217 except KeyError:
218 continue
219 if m[n]:
220 spaces += 1
221
Guido van Rossum89da5d72006-08-22 00:21:25 +0000222 quotechar = max(quotes, key=quotes.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000223
224 if delims:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000225 delim = max(delims, key=delims.get)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000226 skipinitialspace = delims[delim] == spaces
227 if delim == '\n': # most likely a file with a single column
228 delim = ''
229 else:
230 # there is *no* delimiter, it's a single column of quoted data
231 delim = ''
232 skipinitialspace = 0
233
234 return (quotechar, delim, skipinitialspace)
235
236
Skip Montanaro77892372003-05-19 15:33:36 +0000237 def _guess_delimiter(self, data, delimiters):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000238 """
239 The delimiter /should/ occur the same number of times on
240 each row. However, due to malformed data, it may not. We don't want
241 an all or nothing approach, so we allow for small variations in this
242 number.
243 1) build a table of the frequency of each character on every line.
244 2) build a table of freqencies of this frequency (meta-frequency?),
245 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
246 7 times in 2 rows'
247 3) use the mode of the meta-frequency to determine the /expected/
248 frequency for that character
249 4) find out how often the character actually meets that goal
250 5) the character that best meets its goal is the delimiter
251 For performance reasons, the data is evaluated in chunks, so it can
252 try and evaluate the smallest portion of the data possible, evaluating
253 additional chunks as necessary.
254 """
255
256 data = filter(None, data.split('\n'))
257
258 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
259
260 # build frequency tables
261 chunkLength = min(10, len(data))
262 iteration = 0
263 charFrequency = {}
264 modes = {}
265 delims = {}
266 start, end = 0, min(chunkLength, len(data))
267 while start < len(data):
268 iteration += 1
269 for line in data[start:end]:
270 for char in ascii:
Skip Montanaro1448d472003-04-25 14:47:16 +0000271 metaFrequency = charFrequency.get(char, {})
Skip Montanaro04ae7052003-04-24 20:21:31 +0000272 # must count even if frequency is 0
Skip Montanaro91bb70c2005-12-28 15:37:25 +0000273 freq = line.count(char)
Skip Montanaro04ae7052003-04-24 20:21:31 +0000274 # value is the mode
Skip Montanaro1448d472003-04-25 14:47:16 +0000275 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
276 charFrequency[char] = metaFrequency
Skip Montanaro04ae7052003-04-24 20:21:31 +0000277
278 for char in charFrequency.keys():
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000279 items = list(charFrequency[char].items())
Skip Montanaro04ae7052003-04-24 20:21:31 +0000280 if len(items) == 1 and items[0][0] == 0:
281 continue
282 # get the mode of the frequencies
283 if len(items) > 1:
Guido van Rossum89da5d72006-08-22 00:21:25 +0000284 modes[char] = max(items, key=lambda x: x[1])
Skip Montanaro04ae7052003-04-24 20:21:31 +0000285 # adjust the mode - subtract the sum of all
286 # other frequencies
287 items.remove(modes[char])
288 modes[char] = (modes[char][0], modes[char][1]
Guido van Rossum89da5d72006-08-22 00:21:25 +0000289 - sum(item[1] for item in items))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000290 else:
291 modes[char] = items[0]
292
293 # build a list of possible delimiters
294 modeList = modes.items()
295 total = float(chunkLength * iteration)
296 # (rows of consistent data) / (number of rows) = 100%
297 consistency = 1.0
298 # minimum consistency threshold
299 threshold = 0.9
300 while len(delims) == 0 and consistency >= threshold:
301 for k, v in modeList:
302 if v[0] > 0 and v[1] > 0:
Skip Montanaro77892372003-05-19 15:33:36 +0000303 if ((v[1]/total) >= consistency and
304 (delimiters is None or k in delimiters)):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000305 delims[k] = v
306 consistency -= 0.01
307
308 if len(delims) == 1:
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000309 delim = list(delims.keys())[0]
Skip Montanaro04ae7052003-04-24 20:21:31 +0000310 skipinitialspace = (data[0].count(delim) ==
311 data[0].count("%c " % delim))
312 return (delim, skipinitialspace)
313
314 # analyze another chunkLength lines
315 start = end
316 end += chunkLength
317
318 if not delims:
319 return ('', 0)
320
321 # if there's more than one, fall back to a 'preferred' list
322 if len(delims) > 1:
323 for d in self.preferred:
324 if d in delims.keys():
325 skipinitialspace = (data[0].count(d) ==
326 data[0].count("%c " % d))
327 return (d, skipinitialspace)
328
Skip Montanaro39b29be2005-12-30 05:09:48 +0000329 # nothing else indicates a preference, pick the character that
330 # dominates(?)
331 items = [(v,k) for (k,v) in delims.items()]
332 items.sort()
333 delim = items[-1][1]
334
Skip Montanaro04ae7052003-04-24 20:21:31 +0000335 skipinitialspace = (data[0].count(delim) ==
336 data[0].count("%c " % delim))
337 return (delim, skipinitialspace)
338
339
Skip Montanaro1448d472003-04-25 14:47:16 +0000340 def has_header(self, sample):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000341 # Creates a dictionary of types of data in each column. If any
342 # column is of a single type (say, integers), *except* for the first
343 # row, then the first row is presumed to be labels. If the type
344 # can't be determined, it is assumed to be a string in which case
345 # the length of the string is the determining factor: if all of the
346 # rows except for the first are the same length, it's a header.
347 # Finally, a 'vote' is taken at the end for each column, adding or
348 # subtracting from the likelihood of the first row being a header.
349
Skip Montanaro1448d472003-04-25 14:47:16 +0000350 rdr = reader(StringIO(sample), self.sniff(sample))
Skip Montanaro04ae7052003-04-24 20:21:31 +0000351
Georg Brandla18af4e2007-04-21 15:47:16 +0000352 header = next(rdr) # assume first row is header
Skip Montanaro04ae7052003-04-24 20:21:31 +0000353
354 columns = len(header)
355 columnTypes = {}
356 for i in range(columns): columnTypes[i] = None
357
358 checked = 0
Skip Montanaro1448d472003-04-25 14:47:16 +0000359 for row in rdr:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000360 # arbitrary number of rows to check, to keep it sane
361 if checked > 20:
362 break
363 checked += 1
364
365 if len(row) != columns:
366 continue # skip rows that have irregular number of columns
367
Guido van Rossumcc2b0162007-02-11 06:12:03 +0000368 for col in list(columnTypes.keys()):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000369
Guido van Rossume2a383d2007-01-15 16:59:06 +0000370 for thisType in [int, int, float, complex]:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000371 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000372 thisType(row[col])
373 break
Raymond Hettingerabe14e62003-06-12 03:59:17 +0000374 except (ValueError, OverflowError):
Raymond Hettinger39a55922003-06-12 03:01:55 +0000375 pass
376 else:
Skip Montanaro04ae7052003-04-24 20:21:31 +0000377 # fallback to length of string
378 thisType = len(row[col])
379
Raymond Hettinger39a55922003-06-12 03:01:55 +0000380 # treat longs as ints
Guido van Rossume2a383d2007-01-15 16:59:06 +0000381 if thisType == int:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000382 thisType = int
383
Skip Montanaro04ae7052003-04-24 20:21:31 +0000384 if thisType != columnTypes[col]:
385 if columnTypes[col] is None: # add new column type
386 columnTypes[col] = thisType
387 else:
388 # type is inconsistent, remove column from
389 # consideration
390 del columnTypes[col]
391
392 # finally, compare results against first row and "vote"
393 # on whether it's a header
394 hasHeader = 0
395 for col, colType in columnTypes.items():
396 if type(colType) == type(0): # it's a length
397 if len(header[col]) != colType:
398 hasHeader += 1
399 else:
400 hasHeader -= 1
401 else: # attempt typecast
402 try:
Raymond Hettinger39a55922003-06-12 03:01:55 +0000403 colType(header[col])
Raymond Hettingerf31cb0c2003-06-12 04:05:00 +0000404 except (ValueError, TypeError):
Skip Montanaro04ae7052003-04-24 20:21:31 +0000405 hasHeader += 1
406 else:
407 hasHeader -= 1
408
409 return hasHeader > 0