doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

    >>> dump_tokens("u'abc' + U'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('u"abc" + U"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)

    >>> dump_tokens("b'abc' + B'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('b"abc" + B"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

tokenize is broken on test_pep3131.py because regular expressions are
broken on the obscure unicode identifiers in it. *sigh*
    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob
import token

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    try:
        token_list = list(tokenize(f.readline))
    finally:
        f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(keepends=True)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2

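
# A minimal sketch of the property roundtrip() verifies, kept out of the
# test run (the helper name _roundtrip_sketch is ours, not part of the
# suite): untokenize() is free to normalize whitespace, so the bytes need
# not round-trip exactly, but retokenizing its output must reproduce the
# same (type, string) pairs.
def _roundtrip_sketch():
    source = b"if x == 1 :\n  print(x)\n"
    tokens1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    new_bytes = untokenize(tokens1)   # bytes, since the ENCODING token is kept
    tokens2 = [tok[:2] for tok in tokenize(BytesIO(new_bytes).readline)]
    assert tokens1 == tokens2
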
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

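    # Illustrative sketch of the API exercised below (a comment, not a test
    # method): detect_encoding() reads at most two lines from the readline
    # callable and returns the encoding name plus the lines it consumed, e.g.
    #
    #   encoding, consumed = detect_encoding(
    #       BytesIO(b'# -*- coding: latin-1 -*-\nx = 1\n').readline)
    #   # encoding == 'iso-8859-1' (the normalized cookie name)
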
    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)

__test__ = {"doctests": doctests, "decistmt": decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()