doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> import glob

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "    print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

    >>> dump_tokens("u'abc' + U'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('u"abc" + U"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)

    >>> dump_tokens("b'abc' + B'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('b"abc" + B"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12@42")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True
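
Dumping the same line shows the two literals as separate STRING tokens
(this example is an added illustration; the columns follow the fixed-width
format produced by dump_tokens below):

    >>> dump_tokens("'' ''")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "''"          (1, 0) (1, 2)
    STRING     "''"          (1, 3) (1, 5)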

Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

Tokenize is broken on test_pep3131.py because regular expressions are
broken on the obscure unicode identifiers in it. *sigh*
With roundtrip extended to test the 5-tuple mode of untokenize,
7 more testfiles fail. Remove them also until the failure is diagnosed.

    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
    ...     testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)
    ...
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)

Async/await extension:

    >>> dump_tokens("async = 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)

    >>> dump_tokens("a = (async = 1)")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)

    >>> dump_tokens("async()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)

    >>> dump_tokens("class async(Bar):pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)

    >>> dump_tokens("class async:pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)

    >>> dump_tokens("await = 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)

    >>> dump_tokens("foo.async")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)

    >>> dump_tokens("async for a in b: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)

    >>> dump_tokens("async with a as b: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)

    >>> dump_tokens("async.foo")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)

    >>> dump_tokens("async")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)

    >>> dump_tokens("async\\n#comment\\nawait")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)

    >>> dump_tokens("async\\n...\\nawait")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)

    >>> dump_tokens("async\\nawait")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)

    >>> dump_tokens("foo.async + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)

    >>> dump_tokens("async def foo(): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)

    >>> dump_tokens('''async def foo():
    ...   def foo(await):
    ...     await = 1
    ...   if 1:
    ...     await
    ... async += 1
    ... ''')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    NAME       'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    AWAIT      'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)

    >>> dump_tokens('''async def foo():
    ...   async for i in 1: pass''')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    ASYNC      'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                     open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
import os
import token

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
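        # Fixed-width columns: token type (10 chars), repr of the token
        # string (13 chars, truncated), then the (row, col) start and end.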
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized to both 5- and 2-tuples.
    Both sequences are converted back to source code via
    tokenize.untokenize(), and the latter tokenized again to 2-tuples.
    The test fails if the 3 pair tokenizations do not match.

    When untokenize bugs are fixed, untokenize with 5-tuples should
    reproduce code that does not contain a backslash continuation
    following spaces. A proper test should test this.

    This function would be more useful for correcting bugs if it reported
    the first point of failure, like assertEqual, rather than just
    returning False -- or if it were only used in unittests and not
    doctest and actually used assertEqual.
    """
    # Get source code and original tokenizations
    if isinstance(f, str):
        code = f.encode('utf-8')
    else:
        code = f.read()
        f.close()
    readline = iter(code.splitlines(keepends=True)).__next__
    tokens5 = list(tokenize(readline))
    tokens2 = [tok[:2] for tok in tokens5]
    # Reproduce tokens2 from pairs
    bytes_from2 = untokenize(tokens2)
    readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
    tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
    # Reproduce tokens2 from 5-tuples
    bytes_from5 = untokenize(tokens5)
    readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
    tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
    # Compare 3 versions
    return tokens2 == tokens2_from2 == tokens2_from5
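
# A minimal sketch (not part of the test suite) of the invariant that
# roundtrip() checks, assuming an in-memory source like the doctests use:
#
#     code = b"x = 1\n"
#     t2 = [tok[:2] for tok in tokenize(iter(code.splitlines(True)).__next__)]
#     regen = untokenize(t2)          # 2-tuple mode; spacing may change
#     t2_again = [tok[:2] for tok in
#                 tokenize(iter(regen.splitlines(True)).__next__)]
#     assert t2 == t2_again           # the (type, string) pairs must survive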

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
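        # Each 5-tuple is (type, string, start, end, line); type 3 is
        # token.STRING.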
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consecutive one-line defs are OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-2].string, 'OK')  # [-1] is always ENDMARKER

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)


class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if previous row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
                         'start (1,3) precedes previous end (2,2)')
        # raise if previous column in row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        self.assertTrue(roundtrip('a\n b\n c\n \\\n c\n'))

    def test_iter_compat(self):
        u = Untokenizer()
        token = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


class TestRoundtrip(TestCase):
    def roundtrip(self, code):
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])


__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)
    support.run_unittest(UntokenizeTest)
    support.run_unittest(TestRoundtrip)

if __name__ == "__main__":
    test_main()