blob: c8e84813965c17318adc283b8cfa8876394562b8 [file] [log] [blame]
Tor Norbye3a2425a2013-11-04 10:16:08 -08001
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 field_size_limit, \
10 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
12from _csv import Dialect as _Dialect
13
14try:
15 from cStringIO import StringIO
16except ImportError:
17 from StringIO import StringIO
18
19__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
23
24class Dialect:
25 """Describe an Excel dialect.
26
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
30
31 """
32 _name = ""
33 _valid = False
34 # placeholders
35 delimiter = None
36 quotechar = None
37 escapechar = None
38 doublequote = None
39 skipinitialspace = None
40 lineterminator = None
41 quoting = None
42
43 def __init__(self):
44 if self.__class__ != Dialect:
45 self._valid = True
46 self._validate()
47
48 def _validate(self):
49 try:
50 _Dialect(self)
51 except TypeError, e:
52 # We do this for compatibility with py2.3
53 raise Error(str(e))
54
55class excel(Dialect):
56 """Describe the usual properties of Excel-generated CSV files."""
57 delimiter = ','
58 quotechar = '"'
59 doublequote = True
60 skipinitialspace = False
61 lineterminator = '\r\n'
62 quoting = QUOTE_MINIMAL
63register_dialect("excel", excel)
64
65class excel_tab(excel):
66 """Describe the usual properties of Excel-generated TAB-delimited files."""
67 delimiter = '\t'
68register_dialect("excel-tab", excel_tab)
69
70
71class DictReader:
72 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
73 dialect="excel", *args, **kwds):
74 self.fieldnames = fieldnames # list of keys for the dict
75 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
77 self.reader = reader(f, dialect, *args, **kwds)
78 self.dialect = dialect
79 self.line_num = 0
80
81 def __iter__(self):
82 return self
83
84 def next(self):
85 row = self.reader.next()
86 if self.fieldnames is None:
87 self.fieldnames = row
88 row = self.reader.next()
89 self.line_num = self.reader.line_num
90
91 # unlike the basic reader, we prefer not to return blanks,
92 # because we will typically wind up with a dict full of None
93 # values
94 while row == []:
95 row = self.reader.next()
96 d = dict(zip(self.fieldnames, row))
97 lf = len(self.fieldnames)
98 lr = len(row)
99 if lf < lr:
100 d[self.restkey] = row[lf:]
101 elif lf > lr:
102 for key in self.fieldnames[lr:]:
103 d[key] = self.restval
104 return d
105
106
107class DictWriter:
108 def __init__(self, f, fieldnames, restval="", extrasaction="raise",
109 dialect="excel", *args, **kwds):
110 self.fieldnames = fieldnames # list of keys for the dict
111 self.restval = restval # for writing short dicts
112 if extrasaction.lower() not in ("raise", "ignore"):
113 raise ValueError, \
114 ("extrasaction (%s) must be 'raise' or 'ignore'" %
115 extrasaction)
116 self.extrasaction = extrasaction
117 self.writer = writer(f, dialect, *args, **kwds)
118
119 def _dict_to_list(self, rowdict):
120 if self.extrasaction == "raise":
121 for k in rowdict.keys():
122 if k not in self.fieldnames:
123 raise ValueError, "dict contains fields not in fieldnames"
124 return [rowdict.get(key, self.restval) for key in self.fieldnames]
125
126 def writerow(self, rowdict):
127 return self.writer.writerow(self._dict_to_list(rowdict))
128
129 def writerows(self, rowdicts):
130 rows = []
131 for rowdict in rowdicts:
132 rows.append(self._dict_to_list(rowdict))
133 return self.writer.writerows(rows)
134
135# Guard Sniffer's type checking against builds that exclude complex()
136try:
137 complex
138except NameError:
139 complex = float
140
141class Sniffer:
142 '''
143 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
144 Returns a Dialect object.
145 '''
146 def __init__(self):
147 # in case there is more than one possible delimiter
148 self.preferred = [',', '\t', ';', ' ', ':']
149
150
151 def sniff(self, sample, delimiters=None):
152 """
153 Returns a dialect (or None) corresponding to the sample
154 """
155
156 quotechar, delimiter, skipinitialspace = \
157 self._guess_quote_and_delimiter(sample, delimiters)
158 if not delimiter:
159 delimiter, skipinitialspace = self._guess_delimiter(sample,
160 delimiters)
161
162 if not delimiter:
163 raise Error, "Could not determine delimiter"
164
165 class dialect(Dialect):
166 _name = "sniffed"
167 lineterminator = '\r\n'
168 quoting = QUOTE_MINIMAL
169 # escapechar = ''
170 doublequote = False
171
172 dialect.delimiter = delimiter
173 # _csv.reader won't accept a quotechar of ''
174 dialect.quotechar = quotechar or '"'
175 dialect.skipinitialspace = skipinitialspace
176
177 return dialect
178
179
180 def _guess_quote_and_delimiter(self, data, delimiters):
181 """
182 Looks for text enclosed between two identical quotes
183 (the probable quotechar) which are preceded and followed
184 by the same character (the probable delimiter).
185 For example:
186 ,'some text',
187 The quote with the most wins, same with the delimiter.
188 If there is no quotechar the delimiter can't be determined
189 this way.
190 """
191
192 matches = []
193 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
194 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
195 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
196 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
197 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
198 matches = regexp.findall(data)
199 if matches:
200 break
201
202 if not matches:
203 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
204
205 quotes = {}
206 delims = {}
207 spaces = 0
208 for m in matches:
209 n = regexp.groupindex['quote'] - 1
210 key = m[n]
211 if key:
212 quotes[key] = quotes.get(key, 0) + 1
213 try:
214 n = regexp.groupindex['delim'] - 1
215 key = m[n]
216 except KeyError:
217 continue
218 if key and (delimiters is None or key in delimiters):
219 delims[key] = delims.get(key, 0) + 1
220 try:
221 n = regexp.groupindex['space'] - 1
222 except KeyError:
223 continue
224 if m[n]:
225 spaces += 1
226
227 quotechar = reduce(lambda a, b, quotes = quotes:
228 (quotes[a] > quotes[b]) and a or b, quotes.keys())
229
230 if delims:
231 delim = reduce(lambda a, b, delims = delims:
232 (delims[a] > delims[b]) and a or b, delims.keys())
233 skipinitialspace = delims[delim] == spaces
234 if delim == '\n': # most likely a file with a single column
235 delim = ''
236 else:
237 # there is *no* delimiter, it's a single column of quoted data
238 delim = ''
239 skipinitialspace = 0
240
241 return (quotechar, delim, skipinitialspace)
242
243
244 def _guess_delimiter(self, data, delimiters):
245 """
246 The delimiter /should/ occur the same number of times on
247 each row. However, due to malformed data, it may not. We don't want
248 an all or nothing approach, so we allow for small variations in this
249 number.
250 1) build a table of the frequency of each character on every line.
251 2) build a table of freqencies of this frequency (meta-frequency?),
252 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
253 7 times in 2 rows'
254 3) use the mode of the meta-frequency to determine the /expected/
255 frequency for that character
256 4) find out how often the character actually meets that goal
257 5) the character that best meets its goal is the delimiter
258 For performance reasons, the data is evaluated in chunks, so it can
259 try and evaluate the smallest portion of the data possible, evaluating
260 additional chunks as necessary.
261 """
262
263 data = filter(None, data.split('\n'))
264
265 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
266
267 # build frequency tables
268 chunkLength = min(10, len(data))
269 iteration = 0
270 charFrequency = {}
271 modes = {}
272 delims = {}
273 start, end = 0, min(chunkLength, len(data))
274 while start < len(data):
275 iteration += 1
276 for line in data[start:end]:
277 for char in ascii:
278 metaFrequency = charFrequency.get(char, {})
279 # must count even if frequency is 0
280 freq = line.count(char)
281 # value is the mode
282 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
283 charFrequency[char] = metaFrequency
284
285 for char in charFrequency.keys():
286 items = charFrequency[char].items()
287 if len(items) == 1 and items[0][0] == 0:
288 continue
289 # get the mode of the frequencies
290 if len(items) > 1:
291 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
292 items)
293 # adjust the mode - subtract the sum of all
294 # other frequencies
295 items.remove(modes[char])
296 modes[char] = (modes[char][0], modes[char][1]
297 - reduce(lambda a, b: (0, a[1] + b[1]),
298 items)[1])
299 else:
300 modes[char] = items[0]
301
302 # build a list of possible delimiters
303 modeList = modes.items()
304 total = float(chunkLength * iteration)
305 # (rows of consistent data) / (number of rows) = 100%
306 consistency = 1.0
307 # minimum consistency threshold
308 threshold = 0.9
309 while len(delims) == 0 and consistency >= threshold:
310 for k, v in modeList:
311 if v[0] > 0 and v[1] > 0:
312 if ((v[1]/total) >= consistency and
313 (delimiters is None or k in delimiters)):
314 delims[k] = v
315 consistency -= 0.01
316
317 if len(delims) == 1:
318 delim = delims.keys()[0]
319 skipinitialspace = (data[0].count(delim) ==
320 data[0].count("%c " % delim))
321 return (delim, skipinitialspace)
322
323 # analyze another chunkLength lines
324 start = end
325 end += chunkLength
326
327 if not delims:
328 return ('', 0)
329
330 # if there's more than one, fall back to a 'preferred' list
331 if len(delims) > 1:
332 for d in self.preferred:
333 if d in delims.keys():
334 skipinitialspace = (data[0].count(d) ==
335 data[0].count("%c " % d))
336 return (d, skipinitialspace)
337
338 # nothing else indicates a preference, pick the character that
339 # dominates(?)
340 items = [(v,k) for (k,v) in delims.items()]
341 items.sort()
342 delim = items[-1][1]
343
344 skipinitialspace = (data[0].count(delim) ==
345 data[0].count("%c " % delim))
346 return (delim, skipinitialspace)
347
348
349 def has_header(self, sample):
350 # Creates a dictionary of types of data in each column. If any
351 # column is of a single type (say, integers), *except* for the first
352 # row, then the first row is presumed to be labels. If the type
353 # can't be determined, it is assumed to be a string in which case
354 # the length of the string is the determining factor: if all of the
355 # rows except for the first are the same length, it's a header.
356 # Finally, a 'vote' is taken at the end for each column, adding or
357 # subtracting from the likelihood of the first row being a header.
358
359 rdr = reader(StringIO(sample), self.sniff(sample))
360
361 header = rdr.next() # assume first row is header
362
363 columns = len(header)
364 columnTypes = {}
365 for i in range(columns): columnTypes[i] = None
366
367 checked = 0
368 for row in rdr:
369 # arbitrary number of rows to check, to keep it sane
370 if checked > 20:
371 break
372 checked += 1
373
374 if len(row) != columns:
375 continue # skip rows that have irregular number of columns
376
377 for col in columnTypes.keys():
378
379 for thisType in [int, long, float, complex]:
380 try:
381 thisType(row[col])
382 break
383 except (ValueError, OverflowError):
384 pass
385 else:
386 # fallback to length of string
387 thisType = len(row[col])
388
389 # treat longs as ints
390 if thisType == long:
391 thisType = int
392
393 if thisType != columnTypes[col]:
394 if columnTypes[col] is None: # add new column type
395 columnTypes[col] = thisType
396 else:
397 # type is inconsistent, remove column from
398 # consideration
399 del columnTypes[col]
400
401 # finally, compare results against first row and "vote"
402 # on whether it's a header
403 hasHeader = 0
404 for col, colType in columnTypes.items():
405 if type(colType) == type(0): # it's a length
406 if len(header[col]) != colType:
407 hasHeader += 1
408 else:
409 hasHeader -= 1
410 else: # attempt typecast
411 try:
412 colType(header[col])
413 except (ValueError, TypeError):
414 hasHeader += 1
415 else:
416 hasHeader -= 1
417
418 return hasHeader > 0