Blame - Lib/csv.py - platform/external/python/cpython2

2003-04-24 20:21:31 +0000

[diff] [blame]

1

2

"""

3

csv.py - read/write/investigate CSV files

4

"""

5

6

import re

Brett Cannon

9fc5631

2008-08-09 23:39:11 +0000

[diff] [blame]

7

from functools import reduce

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

8

from _csv import Error, __version__, writer, reader, register_dialect, \

9

unregister_dialect, get_dialect, list_dialects, \

Andrew McNamara

31d8896

2005-01-12 03:45:10 +0000

[diff] [blame]

10

field_size_limit, \

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

11

QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \

12

__doc__

Andrew McNamara

7130ff5

2005-01-11 02:22:47 +0000

[diff] [blame]

13

from _csv import Dialect as _Dialect

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

14

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

15

try:

16

from cStringIO import StringIO

17

except ImportError:

18

from StringIO import StringIO

19

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

20

__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",

Brett Cannon

88f801d

2008-08-18 00:46:22 +0000

[diff] [blame]

21

"Error", "Dialect", "__doc__", "excel", "excel_tab",

22

"field_size_limit", "reader", "writer",

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

23

"register_dialect", "get_dialect", "list_dialects", "Sniffer",

24

"unregister_dialect", "__version__", "DictReader", "DictWriter" ]

25

26

class Dialect:

Skip Montanaro

f26285c

2005-01-05 06:54:58 +0000

[diff] [blame]

27

"""Describe an Excel dialect.

28

29

This must be subclassed (see csv.excel). Valid attributes are:

30

delimiter, quotechar, escapechar, doublequote, skipinitialspace,

31

lineterminator, quoting.

32

33

"""

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

_name = ""

_valid = False

# placeholders

delimiter = None

quotechar = None

escapechar = None

doublequote = None

skipinitialspace = None

42

lineterminator = None

quoting = None

def __init__(self):

if self.__class__ != Dialect:

47

self._valid = True

Andrew McNamara

7130ff5

2005-01-11 02:22:47 +0000

[diff] [blame]

48

self._validate()

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

49

50

def _validate(self):

Andrew McNamara

7130ff5

2005-01-11 02:22:47 +0000

[diff] [blame]

try:

_Dialect(self)

except TypeError, e:

# We do this for compatibility with py2.3

55

raise Error(str(e))

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

56

57

class excel(Dialect):

Skip Montanaro

f26285c

2005-01-05 06:54:58 +0000

[diff] [blame]

58

"""Describe the usual properties of Excel-generated CSV files."""

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

delimiter = ','

quotechar = '"'

doublequote = True

skipinitialspace = False

63

lineterminator = '\r\n'

64

quoting = QUOTE_MINIMAL

65

register_dialect("excel", excel)

66

67

class excel_tab(excel):

Skip Montanaro

f26285c

2005-01-05 06:54:58 +0000

[diff] [blame]

68

"""Describe the usual properties of Excel-generated TAB-delimited files."""

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

69

delimiter = '\t'

70

register_dialect("excel-tab", excel_tab)

71

72

73

class DictReader:

Skip Montanaro

dffeed3

2003-10-03 14:03:01 +0000

[diff] [blame]

74

def __init__(self, f, fieldnames=None, restkey=None, restval=None,

Skip Montanaro

2003-09-06 19:52:12 +0000

[diff] [blame]

75

dialect="excel", *args, **kwds):

Skip Montanaro

a032bf4

2008-08-08 22:52:51 +0000

[diff] [blame]

76

self._fieldnames = fieldnames # list of keys for the dict

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

77

self.restkey = restkey # key to catch long rows

78

self.restval = restval # default value for short rows

Skip Montanaro

2003-09-06 19:52:12 +0000

[diff] [blame]

79

self.reader = reader(f, dialect, *args, **kwds)

Georg Brandl

77354cf

2008-03-21 20:01:51 +0000

[diff] [blame]

80

self.dialect = dialect

81

self.line_num = 0

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

def __iter__(self):

return self

Skip Montanaro

2008-08-08 22:52:51 +0000

[diff] [blame]

86

@property

87

def fieldnames(self):

88

if self._fieldnames is None:

89

try:

90

self._fieldnames = self.reader.next()

91

except StopIteration:

92

pass

93

self.line_num = self.reader.line_num

94

return self._fieldnames

95

96

@fieldnames.setter

97

def fieldnames(self, value):

98

self._fieldnames = value

99

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

100

def next(self):

Skip Montanaro

a032bf4

2008-08-08 22:52:51 +0000

[diff] [blame]

101

if self.line_num == 0:

102

# Used only for its side effect.

103

self.fieldnames

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

104

row = self.reader.next()

Georg Brandl

77354cf

2008-03-21 20:01:51 +0000

[diff] [blame]

105

self.line_num = self.reader.line_num

Skip Montanaro

dffeed3

2003-10-03 14:03:01 +0000

[diff] [blame]

106

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

107

# unlike the basic reader, we prefer not to return blanks,

108

# because we will typically wind up with a dict full of None

109

# values

110

while row == []:

111

row = self.reader.next()

112

d = dict(zip(self.fieldnames, row))

113

lf = len(self.fieldnames)

114

lr = len(row)

115

if lf < lr:

116

d[self.restkey] = row[lf:]

117

elif lf > lr:

118

for key in self.fieldnames[lr:]:

119

d[key] = self.restval

return d

class DictWriter:

def __init__(self, f, fieldnames, restval="", extrasaction="raise",

Skip Montanaro

2003-09-06 19:52:12 +0000

[diff] [blame]

125

dialect="excel", *args, **kwds):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

126

self.fieldnames = fieldnames # list of keys for the dict

127

self.restval = restval # for writing short dicts

128

if extrasaction.lower() not in ("raise", "ignore"):

129

raise ValueError, \

130

("extrasaction (%s) must be 'raise' or 'ignore'" %

131

extrasaction)

132

self.extrasaction = extrasaction

Skip Montanaro

2003-09-06 19:52:12 +0000

[diff] [blame]

133

self.writer = writer(f, dialect, *args, **kwds)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

134

Dirkjan Ochtman

8614817

2010-02-23 21:09:52 +0000

[diff] [blame]

135

def writeheader(self):

136

header = dict(zip(self.fieldnames, self.fieldnames))

137

self.writerow(header)

138

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

139

def _dict_to_list(self, rowdict):

140

if self.extrasaction == "raise":

Georg Brandl

94fe3f5

2007-03-13 09:32:11 +0000

[diff] [blame]

141

wrong_fields = [k for k in rowdict if k not in self.fieldnames]

142

if wrong_fields:

143

raise ValueError("dict contains fields not in fieldnames: " +

144

", ".join(wrong_fields))

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

145

return [rowdict.get(key, self.restval) for key in self.fieldnames]

146

147

def writerow(self, rowdict):

148

return self.writer.writerow(self._dict_to_list(rowdict))

149

150

def writerows(self, rowdicts):

151

rows = []

152

for rowdict in rowdicts:

153

rows.append(self._dict_to_list(rowdict))

154

return self.writer.writerows(rows)

155

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

156

# Guard Sniffer's type checking against builds that exclude complex()

try:

complex

except NameError:

complex = float

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

class Sniffer:

'''

"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

165

Returns a Dialect object.

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

166

'''

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

167

def __init__(self):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

168

# in case there is more than one possible delimiter

169

self.preferred = [',', '\t', ';', ' ', ':']

170

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

171

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

172

def sniff(self, sample, delimiters=None):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

173

"""

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

174

Returns a dialect (or None) corresponding to the sample

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

175

"""

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

176

Skip Montanaro

b4fd4d3

2009-09-28 02:12:27 +0000

[diff] [blame]

177

quotechar, doublequote, delimiter, skipinitialspace = \

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

178

self._guess_quote_and_delimiter(sample, delimiters)

Skip Montanaro

39b29be

2005-12-30 05:09:48 +0000

[diff] [blame]

179

if not delimiter:

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

180

delimiter, skipinitialspace = self._guess_delimiter(sample,

181

delimiters)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

182

Skip Montanaro

39b29be

2005-12-30 05:09:48 +0000

[diff] [blame]

183

if not delimiter:

184

raise Error, "Could not determine delimiter"

185

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

186

class dialect(Dialect):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

187

_name = "sniffed"

188

lineterminator = '\r\n'

Fred Drake

7c852f3

2003-04-25 14:27:00 +0000

[diff] [blame]

189

quoting = QUOTE_MINIMAL

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

190

# escapechar = ''

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

191

Skip Montanaro

b4fd4d3

2009-09-28 02:12:27 +0000

[diff] [blame]

192

dialect.doublequote = doublequote

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

193

dialect.delimiter = delimiter

194

# _csv.reader won't accept a quotechar of ''

195

dialect.quotechar = quotechar or '"'

196

dialect.skipinitialspace = skipinitialspace

197

198

return dialect

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

199

200

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

201

def _guess_quote_and_delimiter(self, data, delimiters):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

202

"""

203

Looks for text enclosed between two identical quotes

204

(the probable quotechar) which are preceded and followed

205

by the same character (the probable delimiter).

206

For example:

207

,'some text',

208

The quote with the most wins, same with the delimiter.

209

If there is no quotechar the delimiter can't be determined

this way.

"""

matches = []

for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",

215

'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",

216

'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"

217

'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)

Fred Drake

6f7b213

2003-09-02 16:01:07 +0000

[diff] [blame]

218

regexp = re.compile(restr, re.DOTALL | re.MULTILINE)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

219

matches = regexp.findall(data)

if matches:

break

if not matches:

Skip Montanaro

b4fd4d3

2009-09-28 02:12:27 +0000

[diff] [blame]

224

# (quotechar, doublequote, delimiter, skipinitialspace)

225

return ('', False, None, 0)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

quotes = {}

delims = {}

spaces = 0

for m in matches:

n = regexp.groupindex['quote'] - 1

231

key = m[n]

232

if key:

233

quotes[key] = quotes.get(key, 0) + 1

234

try:

235

n = regexp.groupindex['delim'] - 1

236

key = m[n]

237

except KeyError:

238

continue

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

239

if key and (delimiters is None or key in delimiters):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

240

delims[key] = delims.get(key, 0) + 1

241

try:

242

n = regexp.groupindex['space'] - 1

except KeyError:

continue

if m[n]:

spaces += 1

quotechar = reduce(lambda a, b, quotes = quotes:

249

(quotes[a] > quotes[b]) and a or b, quotes.keys())

250

251

if delims:

252

delim = reduce(lambda a, b, delims = delims:

253

(delims[a] > delims[b]) and a or b, delims.keys())

254

skipinitialspace = delims[delim] == spaces

255

if delim == '\n': # most likely a file with a single column

256

delim = ''

257

else:

258

# there is *no* delimiter, it's a single column of quoted data

delim = ''

skipinitialspace = 0

Skip Montanaro

2009-09-28 02:12:27 +0000

[diff] [blame]

262

# if we see an extra quote between delimiters, we've got a

263

# double quoted format

264

dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \

265

{'delim':delim, 'quote':quotechar}, re.MULTILINE)

if dq_regexp.search(data):

doublequote = True

else:

doublequote = False

return (quotechar, doublequote, delim, skipinitialspace)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

275

276

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

277

def _guess_delimiter(self, data, delimiters):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

278

"""

279

The delimiter /should/ occur the same number of times on

280

each row. However, due to malformed data, it may not. We don't want

281

an all or nothing approach, so we allow for small variations in this

282

number.

283

1) build a table of the frequency of each character on every line.

Ezio Melotti

c2077b0

2011-03-16 12:34:31 +0200

[diff] [blame]

284

2) build a table of frequencies of this frequency (meta-frequency?),

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

285

e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,

286

7 times in 2 rows'

287

3) use the mode of the meta-frequency to determine the /expected/

288

frequency for that character

289

4) find out how often the character actually meets that goal

290

5) the character that best meets its goal is the delimiter

291

For performance reasons, the data is evaluated in chunks, so it can

292

try and evaluate the smallest portion of the data possible, evaluating

293

additional chunks as necessary.

294

"""

295

296

data = filter(None, data.split('\n'))

297

298

ascii = [chr(c) for c in range(127)] # 7-bit ASCII

299

300

# build frequency tables

301

chunkLength = min(10, len(data))

iteration = 0

charFrequency = {}

modes = {}

delims = {}

start, end = 0, min(chunkLength, len(data))

307

while start < len(data):

308

iteration += 1

309

for line in data[start:end]:

310

for char in ascii:

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

311

metaFrequency = charFrequency.get(char, {})

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

312

# must count even if frequency is 0

Skip Montanaro

91bb70c

2005-12-28 15:37:25 +0000

[diff] [blame]

313

freq = line.count(char)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

314

# value is the mode

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

315

metaFrequency[freq] = metaFrequency.get(freq, 0) + 1

316

charFrequency[char] = metaFrequency

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

317

318

for char in charFrequency.keys():

319

items = charFrequency[char].items()

320

if len(items) == 1 and items[0][0] == 0:

321

continue

322

# get the mode of the frequencies

323

if len(items) > 1:

324

modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,

325

items)

326

# adjust the mode - subtract the sum of all

327

# other frequencies

328

items.remove(modes[char])

329

modes[char] = (modes[char][0], modes[char][1]

330

- reduce(lambda a, b: (0, a[1] + b[1]),

331

items)[1])

332

else:

333

modes[char] = items[0]

334

335

# build a list of possible delimiters

336

modeList = modes.items()

337

total = float(chunkLength * iteration)

338

# (rows of consistent data) / (number of rows) = 100%

339

consistency = 1.0

340

# minimum consistency threshold

341

threshold = 0.9

342

while len(delims) == 0 and consistency >= threshold:

343

for k, v in modeList:

344

if v[0] > 0 and v[1] > 0:

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

345

if ((v[1]/total) >= consistency and

346

(delimiters is None or k in delimiters)):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

delims[k] = v

consistency -= 0.01

if len(delims) == 1:

delim = delims.keys()[0]

352

skipinitialspace = (data[0].count(delim) ==

353

data[0].count("%c " % delim))

354

return (delim, skipinitialspace)

355

356

# analyze another chunkLength lines

start = end

end += chunkLength

if not delims:

return ('', 0)

# if there's more than one, fall back to a 'preferred' list

364

if len(delims) > 1:

365

for d in self.preferred:

366

if d in delims.keys():

367

skipinitialspace = (data[0].count(d) ==

368

data[0].count("%c " % d))

369

return (d, skipinitialspace)

370

Skip Montanaro

39b29be

2005-12-30 05:09:48 +0000

[diff] [blame]

371

# nothing else indicates a preference, pick the character that

372

# dominates(?)

373

items = [(v,k) for (k,v) in delims.items()]

items.sort()

delim = items[-1][1]

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

377

skipinitialspace = (data[0].count(delim) ==

378

data[0].count("%c " % delim))

379

return (delim, skipinitialspace)

380

381

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

382

def has_header(self, sample):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

383

# Creates a dictionary of types of data in each column. If any

384

# column is of a single type (say, integers), *except* for the first

385

# row, then the first row is presumed to be labels. If the type

386

# can't be determined, it is assumed to be a string in which case

387

# the length of the string is the determining factor: if all of the

388

# rows except for the first are the same length, it's a header.

389

# Finally, a 'vote' is taken at the end for each column, adding or

390

# subtracting from the likelihood of the first row being a header.

391

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

392

rdr = reader(StringIO(sample), self.sniff(sample))

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

393

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

394

header = rdr.next() # assume first row is header

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

395

396

columns = len(header)

397

columnTypes = {}

398

for i in range(columns): columnTypes[i] = None

399

400

checked = 0

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

401

for row in rdr:

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

402

# arbitrary number of rows to check, to keep it sane

if checked > 20:

break

checked += 1

if len(row) != columns:

408

continue # skip rows that have irregular number of columns

409

410

for col in columnTypes.keys():

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

411

412

for thisType in [int, long, float, complex]:

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

413

try:

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

414

thisType(row[col])

415

break

Raymond Hettinger

abe14e6

2003-06-12 03:59:17 +0000

[diff] [blame]

416

except (ValueError, OverflowError):

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

417

pass

418

else:

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

419

# fallback to length of string

420

thisType = len(row[col])

421

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

422

# treat longs as ints

if thisType == long:

thisType = int

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

426

if thisType != columnTypes[col]:

427

if columnTypes[col] is None: # add new column type

428

columnTypes[col] = thisType

429

else:

430

# type is inconsistent, remove column from

# consideration

del columnTypes[col]

# finally, compare results against first row and "vote"

435

# on whether it's a header

436

hasHeader = 0

437

for col, colType in columnTypes.items():

438

if type(colType) == type(0): # it's a length

439

if len(header[col]) != colType:

hasHeader += 1

else:

hasHeader -= 1

else: # attempt typecast

444

try:

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

445

colType(header[col])

Raymond Hettinger

f31cb0c

2003-06-12 04:05:00 +0000

[diff] [blame]

446

except (ValueError, TypeError):

Skip Montanaro