Blame - Lib/csv.py - platform/external/python/cpython3

2003-04-24 20:21:31 +0000

[diff] [blame]

1

2

"""

3

csv.py - read/write/investigate CSV files

"""

import re

from _csv import Error, __version__, writer, reader, register_dialect, \

8

unregister_dialect, get_dialect, list_dialects, \

9

QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \

10

__doc__

Andrew McNamara

7130ff5

2005-01-11 02:22:47 +0000

[diff] [blame]

11

from _csv import Dialect as _Dialect

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

12

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

13

try:

14

from cStringIO import StringIO

15

except ImportError:

16

from StringIO import StringIO

17

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

18

__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",

19

"Error", "Dialect", "excel", "excel_tab", "reader", "writer",

20

"register_dialect", "get_dialect", "list_dialects", "Sniffer",

21

"unregister_dialect", "__version__", "DictReader", "DictWriter" ]

22

23

class Dialect:

Skip Montanaro

f26285c

2005-01-05 06:54:58 +0000

[diff] [blame]

24

"""Describe an Excel dialect.

25

26

This must be subclassed (see csv.excel). Valid attributes are:

27

delimiter, quotechar, escapechar, doublequote, skipinitialspace,

28

lineterminator, quoting.

29

30

"""

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

_name = ""

_valid = False

# placeholders

delimiter = None

quotechar = None

escapechar = None

doublequote = None

skipinitialspace = None

39

lineterminator = None

quoting = None

def __init__(self):

if self.__class__ != Dialect:

44

self._valid = True

Andrew McNamara

7130ff5

2005-01-11 02:22:47 +0000

[diff] [blame]

45

self._validate()

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

46

47

def _validate(self):

Andrew McNamara

7130ff5

2005-01-11 02:22:47 +0000

[diff] [blame]

try:

_Dialect(self)

except TypeError, e:

# We do this for compatibility with py2.3

52

raise Error(str(e))

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

53

54

class excel(Dialect):

Skip Montanaro

f26285c

2005-01-05 06:54:58 +0000

[diff] [blame]

55

"""Describe the usual properties of Excel-generated CSV files."""

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

delimiter = ','

quotechar = '"'

doublequote = True

skipinitialspace = False

60

lineterminator = '\r\n'

61

quoting = QUOTE_MINIMAL

62

register_dialect("excel", excel)

63

64

class excel_tab(excel):

Skip Montanaro

f26285c

2005-01-05 06:54:58 +0000

[diff] [blame]

65

"""Describe the usual properties of Excel-generated TAB-delimited files."""

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

66

delimiter = '\t'

67

register_dialect("excel-tab", excel_tab)

68

69

70

class DictReader:

Skip Montanaro

dffeed3

2003-10-03 14:03:01 +0000

[diff] [blame]

71

def __init__(self, f, fieldnames=None, restkey=None, restval=None,

Skip Montanaro

2003-09-06 19:52:12 +0000

[diff] [blame]

72

dialect="excel", *args, **kwds):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

73

self.fieldnames = fieldnames # list of keys for the dict

74

self.restkey = restkey # key to catch long rows

75

self.restval = restval # default value for short rows

Skip Montanaro

2003-09-06 19:52:12 +0000

[diff] [blame]

76

self.reader = reader(f, dialect, *args, **kwds)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

def __iter__(self):

return self

def next(self):

row = self.reader.next()

Skip Montanaro

dffeed3

2003-10-03 14:03:01 +0000

[diff] [blame]

83

if self.fieldnames is None:

84

self.fieldnames = row

85

row = self.reader.next()

86

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

87

# unlike the basic reader, we prefer not to return blanks,

88

# because we will typically wind up with a dict full of None

89

# values

90

while row == []:

91

row = self.reader.next()

92

d = dict(zip(self.fieldnames, row))

93

lf = len(self.fieldnames)

94

lr = len(row)

95

if lf < lr:

96

d[self.restkey] = row[lf:]

97

elif lf > lr:

98

for key in self.fieldnames[lr:]:

99

d[key] = self.restval

return d

class DictWriter:

def __init__(self, f, fieldnames, restval="", extrasaction="raise",

Skip Montanaro

2003-09-06 19:52:12 +0000

[diff] [blame]

105

dialect="excel", *args, **kwds):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

106

self.fieldnames = fieldnames # list of keys for the dict

107

self.restval = restval # for writing short dicts

108

if extrasaction.lower() not in ("raise", "ignore"):

109

raise ValueError, \

110

("extrasaction (%s) must be 'raise' or 'ignore'" %

111

extrasaction)

112

self.extrasaction = extrasaction

Skip Montanaro

2003-09-06 19:52:12 +0000

[diff] [blame]

113

self.writer = writer(f, dialect, *args, **kwds)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

114

115

def _dict_to_list(self, rowdict):

116

if self.extrasaction == "raise":

117

for k in rowdict.keys():

118

if k not in self.fieldnames:

119

raise ValueError, "dict contains fields not in fieldnames"

120

return [rowdict.get(key, self.restval) for key in self.fieldnames]

121

122

def writerow(self, rowdict):

123

return self.writer.writerow(self._dict_to_list(rowdict))

124

125

def writerows(self, rowdicts):

126

rows = []

127

for rowdict in rowdicts:

128

rows.append(self._dict_to_list(rowdict))

129

return self.writer.writerows(rows)

130

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

131

# Guard Sniffer's type checking against builds that exclude complex()

try:

complex

except NameError:

complex = float

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

class Sniffer:

'''

"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

140

Returns a Dialect object.

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

141

'''

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

142

def __init__(self):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

143

# in case there is more than one possible delimiter

144

self.preferred = [',', '\t', ';', ' ', ':']

145

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

146

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

147

def sniff(self, sample, delimiters=None):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

148

"""

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

149

Returns a dialect (or None) corresponding to the sample

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

150

"""

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

151

152

quotechar, delimiter, skipinitialspace = \

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

153

self._guess_quote_and_delimiter(sample, delimiters)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

154

if delimiter is None:

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

155

delimiter, skipinitialspace = self._guess_delimiter(sample,

156

delimiters)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

157

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

158

class dialect(Dialect):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

159

_name = "sniffed"

160

lineterminator = '\r\n'

Fred Drake

7c852f3

2003-04-25 14:27:00 +0000

[diff] [blame]

161

quoting = QUOTE_MINIMAL

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

162

# escapechar = ''

163

doublequote = False

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

164

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

165

dialect.delimiter = delimiter

166

# _csv.reader won't accept a quotechar of ''

167

dialect.quotechar = quotechar or '"'

168

dialect.skipinitialspace = skipinitialspace

169

170

return dialect

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

171

172

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

173

def _guess_quote_and_delimiter(self, data, delimiters):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

174

"""

175

Looks for text enclosed between two identical quotes

176

(the probable quotechar) which are preceded and followed

177

by the same character (the probable delimiter).

178

For example:

179

,'some text',

180

The quote with the most wins, same with the delimiter.

181

If there is no quotechar the delimiter can't be determined

this way.

"""

matches = []

for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",

187

'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",

188

'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"

189

'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)

Fred Drake

6f7b213

2003-09-02 16:01:07 +0000

[diff] [blame]

190

regexp = re.compile(restr, re.DOTALL | re.MULTILINE)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

191

matches = regexp.findall(data)

if matches:

break

if not matches:

return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

quotes = {}

delims = {}

spaces = 0

for m in matches:

n = regexp.groupindex['quote'] - 1

203

key = m[n]

204

if key:

205

quotes[key] = quotes.get(key, 0) + 1

206

try:

207

n = regexp.groupindex['delim'] - 1

208

key = m[n]

209

except KeyError:

210

continue

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

211

if key and (delimiters is None or key in delimiters):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

212

delims[key] = delims.get(key, 0) + 1

213

try:

214

n = regexp.groupindex['space'] - 1

except KeyError:

continue

if m[n]:

spaces += 1

quotechar = reduce(lambda a, b, quotes = quotes:

221

(quotes[a] > quotes[b]) and a or b, quotes.keys())

222

223

if delims:

224

delim = reduce(lambda a, b, delims = delims:

225

(delims[a] > delims[b]) and a or b, delims.keys())

226

skipinitialspace = delims[delim] == spaces

227

if delim == '\n': # most likely a file with a single column

228

delim = ''

229

else:

230

# there is *no* delimiter, it's a single column of quoted data

delim = ''

skipinitialspace = 0

return (quotechar, delim, skipinitialspace)

235

236

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

237

def _guess_delimiter(self, data, delimiters):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

238

"""

239

The delimiter /should/ occur the same number of times on

240

each row. However, due to malformed data, it may not. We don't want

241

an all or nothing approach, so we allow for small variations in this

242

number.

243

1) build a table of the frequency of each character on every line.

244

2) build a table of freqencies of this frequency (meta-frequency?),

245

e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,

246

7 times in 2 rows'

247

3) use the mode of the meta-frequency to determine the /expected/

248

frequency for that character

249

4) find out how often the character actually meets that goal

250

5) the character that best meets its goal is the delimiter

251

For performance reasons, the data is evaluated in chunks, so it can

252

try and evaluate the smallest portion of the data possible, evaluating

253

additional chunks as necessary.

254

"""

255

256

data = filter(None, data.split('\n'))

257

258

ascii = [chr(c) for c in range(127)] # 7-bit ASCII

259

260

# build frequency tables

261

chunkLength = min(10, len(data))

iteration = 0

charFrequency = {}

modes = {}

delims = {}

start, end = 0, min(chunkLength, len(data))

267

while start < len(data):

268

iteration += 1

269

for line in data[start:end]:

270

for char in ascii:

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

271

metaFrequency = charFrequency.get(char, {})

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

272

# must count even if frequency is 0

273

freq = line.strip().count(char)

274

# value is the mode

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

275

metaFrequency[freq] = metaFrequency.get(freq, 0) + 1

276

charFrequency[char] = metaFrequency

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

277

278

for char in charFrequency.keys():

279

items = charFrequency[char].items()

280

if len(items) == 1 and items[0][0] == 0:

281

continue

282

# get the mode of the frequencies

283

if len(items) > 1:

284

modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,

285

items)

286

# adjust the mode - subtract the sum of all

287

# other frequencies

288

items.remove(modes[char])

289

modes[char] = (modes[char][0], modes[char][1]

290

- reduce(lambda a, b: (0, a[1] + b[1]),

291

items)[1])

292

else:

293

modes[char] = items[0]

294

295

# build a list of possible delimiters

296

modeList = modes.items()

297

total = float(chunkLength * iteration)

298

# (rows of consistent data) / (number of rows) = 100%

299

consistency = 1.0

300

# minimum consistency threshold

301

threshold = 0.9

302

while len(delims) == 0 and consistency >= threshold:

303

for k, v in modeList:

304

if v[0] > 0 and v[1] > 0:

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

305

if ((v[1]/total) >= consistency and

306

(delimiters is None or k in delimiters)):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

delims[k] = v

consistency -= 0.01

if len(delims) == 1:

delim = delims.keys()[0]

312

skipinitialspace = (data[0].count(delim) ==

313

data[0].count("%c " % delim))

314

return (delim, skipinitialspace)

315

316

# analyze another chunkLength lines

start = end

end += chunkLength

if not delims:

return ('', 0)

# if there's more than one, fall back to a 'preferred' list

324

if len(delims) > 1:

325

for d in self.preferred:

326

if d in delims.keys():

327

skipinitialspace = (data[0].count(d) ==

328

data[0].count("%c " % d))

329

return (d, skipinitialspace)

330

331

# finally, just return the first damn character in the list

332

delim = delims.keys()[0]

333

skipinitialspace = (data[0].count(delim) ==

334

data[0].count("%c " % delim))

335

return (delim, skipinitialspace)

336

337

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

338

def has_header(self, sample):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

339

# Creates a dictionary of types of data in each column. If any

340

# column is of a single type (say, integers), *except* for the first

341

# row, then the first row is presumed to be labels. If the type

342

# can't be determined, it is assumed to be a string in which case

343

# the length of the string is the determining factor: if all of the

344

# rows except for the first are the same length, it's a header.

345

# Finally, a 'vote' is taken at the end for each column, adding or

346

# subtracting from the likelihood of the first row being a header.

347

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

348

rdr = reader(StringIO(sample), self.sniff(sample))

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

349

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

350

header = rdr.next() # assume first row is header

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

351

352

columns = len(header)

353

columnTypes = {}

354

for i in range(columns): columnTypes[i] = None

355

356

checked = 0

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

357

for row in rdr:

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

358

# arbitrary number of rows to check, to keep it sane

if checked > 20:

break

checked += 1

if len(row) != columns:

364

continue # skip rows that have irregular number of columns

365

366

for col in columnTypes.keys():

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

367

368

for thisType in [int, long, float, complex]:

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

369

try:

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

370

thisType(row[col])

371

break

Raymond Hettinger

abe14e6

2003-06-12 03:59:17 +0000

[diff] [blame]

372

except (ValueError, OverflowError):

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

373

pass

374

else:

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

375

# fallback to length of string

376

thisType = len(row[col])

377

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

378

# treat longs as ints

if thisType == long:

thisType = int

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

382

if thisType != columnTypes[col]:

383

if columnTypes[col] is None: # add new column type

384

columnTypes[col] = thisType

385

else:

386

# type is inconsistent, remove column from

# consideration

del columnTypes[col]

# finally, compare results against first row and "vote"

391

# on whether it's a header

392

hasHeader = 0

393

for col, colType in columnTypes.items():

394

if type(colType) == type(0): # it's a length

395

if len(header[col]) != colType:

hasHeader += 1

else:

hasHeader -= 1

else: # attempt typecast

400

try:

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

401

colType(header[col])

Raymond Hettinger

f31cb0c

2003-06-12 04:05:00 +0000

[diff] [blame]

402

except (ValueError, TypeError):

Skip Montanaro