# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
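
# A minimal usage sketch (the snippet below is illustrative): the generator
# can be driven by any readline-compatible callable, e.g. io.StringIO.
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)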

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
def _combinations(*l):
    return set(
        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
    )

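# For illustration, the helpers above compose alternation patterns:
#   group('a', 'b')  -> '(a|b)'
#   any(r'\d')       -> '(\d)*'
#   maybe('x', 'y')  -> '(x|y)?'
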
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

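# Illustrative strings accepted by the patterns above (underscore digit
# separators are allowed; the trailing 'l'/'L' supports 2to3's Python 2 input):
#   re.fullmatch(Hexnumber, '0x_FF')          # matches
#   re.fullmatch(Decnumber, '1_000L')         # matches
#   re.fullmatch(Imagnumber, '1_000.5e-3j')   # matches
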
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}

triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)

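# Illustrative membership in the prefix-expanded sets above:
#   "rb'''" in triple_quoted   # True
#   "BR'" in single_quoted     # True
#   "fb'''" in triple_quoted   # False: 'f' and 'b' never combine
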
tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

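# A minimal usage sketch for the callback-style entry point; the tokeneater
# shown below is purely illustrative:
#
#     import io
#     def show(type, token, start, end, line):
#         print(tok_name[type], repr(token))
#     tokenize(io.StringIO("pass\n").readline, show)
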
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

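# For illustration:
#   cookie_re.match("# -*- coding: utf-8 -*-").group(1)   # 'utf-8'
#   bool(blank_re.match(b"   # a comment\n"))             # True
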
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

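# For illustration:
#   _get_normal_name("UTF_8")    # 'utf-8'
#   _get_normal_name("Latin-1")  # 'iso-8859-1'
#   _get_normal_name("cp1252")   # 'cp1252' (passed through unchanged)
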
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError will be raised as well. Note that if a
    UTF-8 BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

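# A minimal usage sketch: detect_encoding() works on any readline over bytes,
# for example io.BytesIO(...).readline.
#
#     import io
#     enc, lines = detect_encoding(
#         io.BytesIO(b"# -*- coding: latin-1 -*-\n").readline)
#     # enc == 'iso-8859-1'; lines holds the raw line(s) already consumed
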
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

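# A minimal round-trip sketch using full 5-tuples (exact whitespace is only
# preserved in this full-tuple mode):
#
#     import io
#     source = "x = 1\n"
#     toks = list(generate_tokens(io.StringIO(source).readline))
#     assert untokenize(toks) == source
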
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                            and stashed[0] == NAME
                            and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

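# For illustration, the source "if x:\n    pass\n" yields, in order:
# NAME 'if', NAME 'x', OP ':', NEWLINE, INDENT, NAME 'pass', NEWLINE,
# DEDENT, ENDMARKER (keywords come through as plain NAME tokens).
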
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)