Blame - Lib/csv.py - platform/external/python/cpython3

2003-04-24 20:21:31 +0000

[diff] [blame]

1

2

"""

3

csv.py - read/write/investigate CSV files

"""

import re

from _csv import Error, __version__, writer, reader, register_dialect, \

8

unregister_dialect, get_dialect, list_dialects, \

Andrew McNamara

31d8896

2005-01-12 03:45:10 +0000

[diff] [blame]

9

field_size_limit, \

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

10

QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \

11

__doc__

Andrew McNamara

2005-01-11 02:22:47 +0000

[diff] [blame]

12

from _csv import Dialect as _Dialect

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

13

Raymond Hettinger

43ca452

2016-08-30 12:35:50 -0700

[diff] [blame]

14

from collections import OrderedDict

Guido van Rossum

68937b4

2007-05-18 00:51:22 +0000

[diff] [blame]

15

from io import StringIO

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

16

Martin Panter

19e69c5

2015-11-14 12:46:42 +0000

[diff] [blame]

17

__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",

18

"Error", "Dialect", "__doc__", "excel", "excel_tab",

19

"field_size_limit", "reader", "writer",

20

"register_dialect", "get_dialect", "list_dialects", "Sniffer",

21

"unregister_dialect", "__version__", "DictReader", "DictWriter",

22

"unix_dialect"]

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

23

24

class Dialect:
    """Base description of a CSV dialect.

    Use it by subclassing (csv.excel is one such subclass) and overriding
    the class attributes.  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False

    # placeholders: a subclass is expected to replace these
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        """Mark subclass instances as valid, then validate the attributes."""
        # Only subclasses get the flag; a bare Dialect keeps _valid False.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Validate this object by handing it to the _csv Dialect type.

        Raises:
            Error: when _csv.Dialect rejects the attributes with a TypeError;
                the message of the TypeError is reused.
        """
        try:
            _Dialect(self)
        except TypeError as exc:
            # We do this for compatibility with py2.3
            raise Error(str(exc))

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

54

55

class excel(Dialect):
    """Dialect holding the usual properties of CSV files written by Excel."""
    # field separation
    delimiter = ','
    skipinitialspace = False
    # quoting
    quotechar = '"'
    doublequote = True
    quoting = QUOTE_MINIMAL
    # record separation
    lineterminator = '\r\n'

# Make the dialect available under the name "excel".
register_dialect("excel", excel)

64

65

class excel_tab(excel):
    """Dialect holding the usual properties of TAB-delimited Excel files."""
    # Everything except the delimiter is inherited from excel.
    delimiter = '\t'

# Make the dialect available under the name "excel-tab".
register_dialect("excel-tab", excel_tab)

69

Georg Brandl

7424dd3

2010-10-27 07:27:06 +0000

[diff] [blame]

70

class unix_dialect(Dialect):
    """Dialect holding the usual properties of CSV files written on Unix."""
    # field separation
    delimiter = ','
    skipinitialspace = False
    # quoting
    quotechar = '"'
    doublequote = True
    quoting = QUOTE_ALL
    # record separation
    lineterminator = '\n'

# Make the dialect available under the name "unix".
register_dialect("unix", unix_dialect)

79

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

80

81

class DictReader:
    """Iterator that yields each CSV row as an OrderedDict keyed by field name.

    When no fieldnames are supplied, the first row produced by the underlying
    reader is used as the field names.  Values beyond the last field name are
    collected into a list stored under ``restkey``; field names without a
    value are filled with ``restval``.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        """Wrap *f* in a csv reader; extra arguments are passed to it."""
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        """Field names, read lazily from the first row if none were given.

        Every access also copies the underlying reader's line_num onto this
        object.
        """
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                # No rows at all: the field names stay None.
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        """Return the next non-blank row as an OrderedDict."""
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)

        # Reading the property also refreshes line_num after any blank rows
        # that were skipped above.
        names = self.fieldnames
        record = OrderedDict(zip(names, row))
        name_count, value_count = len(names), len(row)
        if name_count < value_count:
            # long row: keep the surplus values together under restkey
            record[self.restkey] = row[name_count:]
        elif name_count > value_count:
            # short row: pad the missing fields with restval
            record.update((key, self.restval) for key in names[value_count:])
        return record

class DictWriter:
    """Write dictionaries as CSV rows, ordered by a list of field names.

    Keys missing from a row dictionary are written as ``restval``.  Keys that
    are not in ``fieldnames`` raise ValueError when ``extrasaction`` is
    "raise" and are dropped when it is "ignore".
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        """Wrap *f* in a csv writer; extra arguments are passed to it.

        Raises:
            ValueError: if extrasaction is neither 'raise' nor 'ignore'
                (compared case-insensitively).
        """
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        # Store the lowercased value: _dict_to_list compares against the
        # lowercase literal, so keeping e.g. "RAISE" verbatim would pass the
        # check here and then silently behave like "ignore".
        normalized = extrasaction.lower()
        if normalized not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = normalized
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write the field names as a row and return writerow()'s result."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return the values of *rowdict* in fieldnames order (lazily).

        Raises:
            ValueError: if extrasaction is "raise" and rowdict has keys that
                are not in fieldnames.
        """
        if self.extrasaction == "raise":
            wrong_fields = rowdict.keys() - self.fieldnames
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return (rowdict.get(key, self.restval) for key in self.fieldnames)

    def writerow(self, rowdict):
        """Write one dictionary as a row; return the underlying writer's result."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write every dictionary in *rowdicts* as a row."""
        return self.writer.writerows(map(self._dict_to_list, rowdicts))

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

159

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

160

# Guard Sniffer's type checking against builds that exclude complex():
# when the name is missing, float is bound to it instead.
try:
    complex
except NameError:
    complex = float

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

class Sniffer:

'''

"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

169

Returns a Dialect object.

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

170

'''

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

171

def __init__(self):
    """Set the delimiter preference list."""
    # in case there is more than one possible delimiter
    self.preferred = [',', '\t', ';', ' ', ':']

174

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

175

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

176

def sniff(self, sample, delimiters=None):
    """
    Returns a Dialect subclass describing the format found in the sample.

    The quote-based guess is tried first; if it yields no delimiter the
    frequency-based guess is used.  Raises Error when neither of them
    finds a delimiter.
    """
    guess = self._guess_quote_and_delimiter(sample, delimiters)
    quotechar, doublequote, delimiter, skipinitialspace = guess
    if not delimiter:
        delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                            delimiters)
        if not delimiter:
            raise Error("Could not determine delimiter")

    class dialect(Dialect):
        _name = "sniffed"
        lineterminator = '\r\n'
        quoting = QUOTE_MINIMAL
        # escapechar = ''

    # The guessed values are attached after the class statement because a
    # class body cannot read a same-named variable of the enclosing function.
    dialect.doublequote = doublequote
    dialect.delimiter = delimiter
    # _csv.reader won't accept a quotechar of ''
    dialect.quotechar = quotechar or '"'
    dialect.skipinitialspace = skipinitialspace

    return dialect

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

203

204

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

205

def _guess_quote_and_delimiter(self, data, delimiters):
    """
    Looks for text enclosed between two identical quotes
    (the probable quotechar) which are preceded and followed
    by the same character (the probable delimiter).
    For example:
                     ,'some text',
    The quote with the most wins, same with the delimiter.
    If there is no quotechar the delimiter can't be determined
    this way.

    Returns a tuple (quotechar, doublequote, delimiter, skipinitialspace).
    When nothing quoted is found the tuple is ('', False, None, 0).  If
    delimiters is not None, only characters contained in it are counted as
    delimiter candidates.
    """
    # Patterns are tried in order; the first one that matches anything wins.
    patterns = (
        r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",
        r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',    #  ".*?",
        # The delim group used to start with a stray '>', so a quoted field at
        # the end of a line only matched when preceded by a literal '>'.
        r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',    # ,".*?"
        r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',                              #  ".*?" (no delim, no space)
    )
    matches = []
    for restr in patterns:
        regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
        matches = list(regexp.finditer(data))
        if matches:
            break

    if not matches:
        # (quotechar, doublequote, delimiter, skipinitialspace)
        return ('', False, None, 0)

    quotes = {}
    delims = {}
    spaces = 0
    # The last pattern has no delim/space groups; then only quotes are tallied.
    has_delim = 'delim' in regexp.groupindex
    for m in matches:
        quote = m.group('quote')
        quotes[quote] = quotes.get(quote, 0) + 1
        if not has_delim:
            continue
        candidate = m.group('delim')
        if delimiters is None or candidate in delimiters:
            delims[candidate] = delims.get(candidate, 0) + 1
        if m.group('space'):
            spaces += 1

    quotechar = max(quotes, key=quotes.get)

    if delims:
        delim = max(delims, key=delims.get)
        # True only if every counted delimiter was followed by a space
        skipinitialspace = delims[delim] == spaces
        if delim == '\n':  # most likely a file with a single column
            delim = ''
    else:
        # there is *no* delimiter, it's a single column of quoted data
        delim = ''
        skipinitialspace = 0

    # if we see an extra quote between delimiters, we've got a
    # double quoted format
    dq_regexp = re.compile(
        r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
        {'delim': re.escape(delim), 'quote': quotechar}, re.MULTILINE)

    doublequote = bool(dq_regexp.search(data))

    return (quotechar, doublequote, delim, skipinitialspace)

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

279

280

Skip Montanaro

2003-05-19 15:33:36 +0000

[diff] [blame]

281

def _guess_delimiter(self, data, delimiters):
    """
    The delimiter /should/ occur the same number of times on
    each row. However, due to malformed data, it may not. We don't want
    an all or nothing approach, so we allow for small variations in this
    number.
      1) build a table of the frequency of each character on every line.
      2) build a table of frequencies of this frequency (meta-frequency?),
         e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
         7 times in 2 rows'
      3) use the mode of the meta-frequency to determine the /expected/
         frequency for that character
      4) find out how often the character actually meets that goal
      5) the character that best meets its goal is the delimiter
    For performance reasons, the data is evaluated in chunks, so it can
    try and evaluate the smallest portion of the data possible, evaluating
    additional chunks as necessary.

    Returns a tuple (delimiter, skipinitialspace); ('', 0) when no
    candidate is found.  If delimiters is not None, only characters
    contained in it are accepted.
    """
    data = list(filter(None, data.split('\n')))

    # 7-bit ASCII (named so that the ascii() builtin is not shadowed)
    ascii_chars = [chr(c) for c in range(127)]

    # build frequency tables
    chunk_length = min(10, len(data))
    iteration = 0
    char_frequency = {}
    modes = {}
    delims = {}
    start, end = 0, chunk_length
    while start < len(data):
        iteration += 1
        for line in data[start:end]:
            for char in ascii_chars:
                meta_frequency = char_frequency.setdefault(char, {})
                # must count even if frequency is 0
                freq = line.count(char)
                # value is the mode
                meta_frequency[freq] = meta_frequency.get(freq, 0) + 1

        for char, meta_frequency in char_frequency.items():
            items = list(meta_frequency.items())
            if len(items) == 1 and items[0][0] == 0:
                # the character never occurred on any line seen so far
                continue
            # get the mode of the frequencies
            if len(items) > 1:
                mode = max(items, key=lambda x: x[1])
                # adjust the mode - subtract the sum of all
                # other frequencies
                items.remove(mode)
                modes[char] = (mode[0],
                               mode[1] - sum(item[1] for item in items))
            else:
                modes[char] = items[0]

        # build a list of possible delimiters
        # Number of rows examined so far.  The last chunk may be partial, so
        # clamp to len(data); counting full chunks would overstate the row
        # count and understate how consistent a character is.
        total = float(min(chunk_length * iteration, len(data)))
        # (rows of consistent data) / (number of rows) = 100%
        consistency = 1.0
        # minimum consistency threshold
        threshold = 0.9
        while len(delims) == 0 and consistency >= threshold:
            for k, v in modes.items():
                if v[0] > 0 and v[1] > 0:
                    if ((v[1] / total) >= consistency and
                            (delimiters is None or k in delimiters)):
                        delims[k] = v
            consistency -= 0.01

        if len(delims) == 1:
            delim = list(delims.keys())[0]
            skipinitialspace = (data[0].count(delim) ==
                                data[0].count("%c " % delim))
            return (delim, skipinitialspace)

        # analyze another chunk_length lines
        start = end
        end += chunk_length

    if not delims:
        return ('', 0)

    # if there's more than one, fall back to a 'preferred' list
    if len(delims) > 1:
        for d in self.preferred:
            if d in delims:
                skipinitialspace = (data[0].count(d) ==
                                    data[0].count("%c " % d))
                return (d, skipinitialspace)

    # nothing else indicates a preference, pick the character that
    # dominates(?)
    items = [(v, k) for (k, v) in delims.items()]
    items.sort()
    delim = items[-1][1]

    skipinitialspace = (data[0].count(delim) ==
                        data[0].count("%c " % delim))
    return (delim, skipinitialspace)

382

383

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

384

def has_header(self, sample):

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

385

# Creates a dictionary of types of data in each column. If any

386

# column is of a single type (say, integers), *except* for the first

387

# row, then the first row is presumed to be labels. If the type

388

# can't be determined, it is assumed to be a string in which case

389

# the length of the string is the determining factor: if all of the

390

# rows except for the first are the same length, it's a header.

391

# Finally, a 'vote' is taken at the end for each column, adding or

392

# subtracting from the likelihood of the first row being a header.

393

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

394

rdr = reader(StringIO(sample), self.sniff(sample))

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

395

Georg Brandl

2007-04-21 15:47:16 +0000

[diff] [blame]

396

header = next(rdr) # assume first row is header

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

397

398

columns = len(header)

399

columnTypes = {}

400

for i in range(columns): columnTypes[i] = None

401

402

checked = 0

Skip Montanaro

2003-04-25 14:47:16 +0000

[diff] [blame]

403

for row in rdr:

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

404

# arbitrary number of rows to check, to keep it sane

if checked > 20:

break

checked += 1

if len(row) != columns:

410

continue # skip rows that have irregular number of columns

411

Guido van Rossum

cc2b016

2007-02-11 06:12:03 +0000

[diff] [blame]

412

for col in list(columnTypes.keys()):

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

413

Amaury Forgeot d'Arc

a461873

2008-04-24 18:26:53 +0000

[diff] [blame]

414

for thisType in [int, float, complex]:

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

415

try:

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

416

thisType(row[col])

417

break

Raymond Hettinger

abe14e6

2003-06-12 03:59:17 +0000

[diff] [blame]

418

except (ValueError, OverflowError):

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

419

pass

420

else:

Skip Montanaro

2003-04-24 20:21:31 +0000

[diff] [blame]

421

# fallback to length of string

422

thisType = len(row[col])

423

424

if thisType != columnTypes[col]:

425

if columnTypes[col] is None: # add new column type

426

columnTypes[col] = thisType

427

else:

428

# type is inconsistent, remove column from

# consideration

del columnTypes[col]

# finally, compare results against first row and "vote"

433

# on whether it's a header

434

hasHeader = 0

435

for col, colType in columnTypes.items():

436

if type(colType) == type(0): # it's a length

437

if len(header[col]) != colType:

hasHeader += 1

else:

hasHeader -= 1

else: # attempt typecast

442

try:

Raymond Hettinger

2003-06-12 03:01:55 +0000

[diff] [blame]

443

colType(header[col])

Raymond Hettinger

f31cb0c

2003-06-12 04:05:00 +0000

[diff] [blame]

444

except (ValueError, TypeError):

Skip Montanaro