# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

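# A minimal usage sketch (illustrative only; generate_tokens and tok_name
# are defined/imported further down in this module):
#
#     import io
#     for tok_type, tok_str, start, end, line in generate_tokens(
#             io.StringIO("x = 1\n").readline):
#         print(tok_name[tok_type], repr(tok_str), start, end)
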
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

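# For instance (a sketch of what these helpers emit, not executed here):
#
#     group('a', 'b')  ->  '(a|b)'
#     any('a', 'b')    ->  '(a|b)*'
#     maybe('a', 'b')  ->  '(a|b)?'
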
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

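# Examples of literals these patterns accept (a comment-only sketch covering
# PEP 515 underscore separators and the legacy 2.x 'L' suffix):
#
#     0b1010_1010      matches Binnumber
#     0x_FF, 0o777L    match Hexnumber / Octnumber
#     1_000.5e3, .5j   match Floatnumber / Imagnumber
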
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

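# How the main loop below uses PseudoToken (a sketch): the Whitespace part
# carries no capture group, so group 1 is the token itself with leading
# whitespace already skipped, and span(1) gives the token's bounds.
#
#     m = pseudoprog.match("    x = 1", 0)
#     m.span(1)   ->  (4, 5)      # the NAME token 'x'
#     m.group(1)  ->  'x'
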
tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "f'''": single3prog, 'f"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "F'''": single3prog, 'F"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'f': None, 'F': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "f'''", 'f"""', "F'''", 'F"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
          "rB'''", 'rB"""', "RB'''", 'RB"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "f'", 'f"', "F'", 'F"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "Rb'", 'Rb"',
          "rB'", 'rB"', "RB'", 'RB"',):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

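# A sketch of the callback-style entry point (the eater function here is
# hypothetical, shown for illustration; printtoken above is the default):
#
#     import io
#     def eater(tok_type, tok_str, start, end, line):
#         print(tok_name[tok_type], repr(tok_str))
#     tokenize(io.StringIO("a + b\n").readline, eater)
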
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

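# For example (a comment-only sketch of the normalization):
#
#     _get_normal_name("UTF_8")         ->  'utf-8'
#     _get_normal_name("Latin-1-Unix")  ->  'iso-8859-1'
#     _get_normal_name("cp1252")        ->  'cp1252'   # passed through
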
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

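# A usage sketch (illustrative): detect_encoding() works on raw bytes, so
# wrap encoded source in io.BytesIO and hand over its readline method.
#
#     import io
#     src = b"# -*- coding: latin-1 -*-\nx = 1\n"
#     encoding, lines = detect_encoding(io.BytesIO(src).readline)
#     encoding  ->  'iso-8859-1'
#     lines     ->  [b'# -*- coding: latin-1 -*-\n']
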
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

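# A full-fidelity sketch: with complete 5-tuples the round trip reproduces
# the source exactly; the 2-tuple path (Untokenizer.compat) only
# approximates spacing.
#
#     import io
#     source = "x = 1\n"
#     toks = list(generate_tokens(io.StringIO(source).readline))
#     assert untokenize(toks) == source
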
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)