doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
       ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else:   print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

109 >>> dump_tokens("0xff <= 255")
Trent Nelson428de652008-03-18 22:41:35 +0000110 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000111 NUMBER '0xff' (1, 0) (1, 4)
112 OP '<=' (1, 5) (1, 7)
113 NUMBER '255' (1, 8) (1, 11)
Eric Smith74ca5572008-03-17 19:49:19 +0000114 >>> dump_tokens("0b10 <= 255")
Trent Nelson428de652008-03-18 22:41:35 +0000115 ENCODING 'utf-8' (0, 0) (0, 0)
Eric Smith74ca5572008-03-17 19:49:19 +0000116 NUMBER '0b10' (1, 0) (1, 4)
117 OP '<=' (1, 5) (1, 7)
118 NUMBER '255' (1, 8) (1, 11)
119 >>> dump_tokens("0o123 <= 0O123")
Trent Nelson428de652008-03-18 22:41:35 +0000120 ENCODING 'utf-8' (0, 0) (0, 0)
Eric Smith74ca5572008-03-17 19:49:19 +0000121 NUMBER '0o123' (1, 0) (1, 5)
122 OP '<=' (1, 6) (1, 8)
123 NUMBER '0O123' (1, 9) (1, 14)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000124 >>> dump_tokens("1234567 > ~0x15")
Trent Nelson428de652008-03-18 22:41:35 +0000125 ENCODING 'utf-8' (0, 0) (0, 0)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000126 NUMBER '1234567' (1, 0) (1, 7)
127 OP '>' (1, 8) (1, 9)
128 OP '~' (1, 10) (1, 11)
129 NUMBER '0x15' (1, 11) (1, 15)
130 >>> dump_tokens("2134568 != 1231515")
Trent Nelson428de652008-03-18 22:41:35 +0000131 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000132 NUMBER '2134568' (1, 0) (1, 7)
133 OP '!=' (1, 8) (1, 10)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000134 NUMBER '1231515' (1, 11) (1, 18)
135 >>> dump_tokens("(-124561-1) & 200000000")
Trent Nelson428de652008-03-18 22:41:35 +0000136 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000137 OP '(' (1, 0) (1, 1)
138 OP '-' (1, 1) (1, 2)
139 NUMBER '124561' (1, 2) (1, 8)
140 OP '-' (1, 8) (1, 9)
141 NUMBER '1' (1, 9) (1, 10)
142 OP ')' (1, 10) (1, 11)
143 OP '&' (1, 12) (1, 13)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000144 NUMBER '200000000' (1, 14) (1, 23)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000145 >>> dump_tokens("0xdeadbeef != -1")
Trent Nelson428de652008-03-18 22:41:35 +0000146 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000147 NUMBER '0xdeadbeef' (1, 0) (1, 10)
148 OP '!=' (1, 11) (1, 13)
149 OP '-' (1, 14) (1, 15)
150 NUMBER '1' (1, 15) (1, 16)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000151 >>> dump_tokens("0xdeadc0de & 12345")
Trent Nelson428de652008-03-18 22:41:35 +0000152 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000153 NUMBER '0xdeadc0de' (1, 0) (1, 10)
154 OP '&' (1, 11) (1, 12)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000155 NUMBER '12345' (1, 13) (1, 18)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000156 >>> dump_tokens("0xFF & 0x15 | 1234")
Trent Nelson428de652008-03-18 22:41:35 +0000157 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000158 NUMBER '0xFF' (1, 0) (1, 4)
159 OP '&' (1, 5) (1, 6)
160 NUMBER '0x15' (1, 7) (1, 11)
161 OP '|' (1, 12) (1, 13)
162 NUMBER '1234' (1, 14) (1, 18)
163
Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

230 >>> dump_tokens("x = ''; y = \\\"\\\"")
Trent Nelson428de652008-03-18 22:41:35 +0000231 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000232 NAME 'x' (1, 0) (1, 1)
233 OP '=' (1, 2) (1, 3)
234 STRING "''" (1, 4) (1, 6)
235 OP ';' (1, 6) (1, 7)
236 NAME 'y' (1, 8) (1, 9)
237 OP '=' (1, 10) (1, 11)
238 STRING '""' (1, 12) (1, 14)
239 >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
Trent Nelson428de652008-03-18 22:41:35 +0000240 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000241 NAME 'x' (1, 0) (1, 1)
242 OP '=' (1, 2) (1, 3)
243 STRING '\\'"\\'' (1, 4) (1, 7)
244 OP ';' (1, 7) (1, 8)
245 NAME 'y' (1, 9) (1, 10)
246 OP '=' (1, 11) (1, 12)
247 STRING '"\\'"' (1, 13) (1, 16)
248 >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
Trent Nelson428de652008-03-18 22:41:35 +0000249 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000250 NAME 'x' (1, 0) (1, 1)
251 OP '=' (1, 2) (1, 3)
252 STRING '"doesn\\'t "' (1, 4) (1, 14)
253 NAME 'shrink' (1, 14) (1, 20)
254 STRING '", does it"' (1, 20) (1, 31)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000255 >>> dump_tokens("x = 'abc' + 'ABC'")
Trent Nelson428de652008-03-18 22:41:35 +0000256 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000257 NAME 'x' (1, 0) (1, 1)
258 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000259 STRING "'abc'" (1, 4) (1, 9)
260 OP '+' (1, 10) (1, 11)
261 STRING "'ABC'" (1, 12) (1, 17)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000262 >>> dump_tokens('y = "ABC" + "ABC"')
Trent Nelson428de652008-03-18 22:41:35 +0000263 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000264 NAME 'y' (1, 0) (1, 1)
265 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000266 STRING '"ABC"' (1, 4) (1, 9)
267 OP '+' (1, 10) (1, 11)
268 STRING '"ABC"' (1, 12) (1, 17)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000269 >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
Trent Nelson428de652008-03-18 22:41:35 +0000270 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000271 NAME 'x' (1, 0) (1, 1)
272 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000273 STRING "r'abc'" (1, 4) (1, 10)
274 OP '+' (1, 11) (1, 12)
275 STRING "r'ABC'" (1, 13) (1, 19)
276 OP '+' (1, 20) (1, 21)
277 STRING "R'ABC'" (1, 22) (1, 28)
278 OP '+' (1, 29) (1, 30)
279 STRING "R'ABC'" (1, 31) (1, 37)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000280 >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
Trent Nelson428de652008-03-18 22:41:35 +0000281 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000282 NAME 'y' (1, 0) (1, 1)
283 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000284 STRING 'r"abc"' (1, 4) (1, 10)
285 OP '+' (1, 11) (1, 12)
286 STRING 'r"ABC"' (1, 13) (1, 19)
287 OP '+' (1, 20) (1, 21)
288 STRING 'R"ABC"' (1, 22) (1, 28)
289 OP '+' (1, 29) (1, 30)
290 STRING 'R"ABC"' (1, 31) (1, 37)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000291
Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

336 >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
337 ... "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
Trent Nelson428de652008-03-18 22:41:35 +0000338 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000339 NAME 'if' (1, 0) (1, 2)
340 NUMBER '1' (1, 3) (1, 4)
341 OP '<' (1, 5) (1, 6)
342 NUMBER '1' (1, 7) (1, 8)
343 OP '>' (1, 9) (1, 10)
344 NUMBER '1' (1, 11) (1, 12)
345 OP '==' (1, 13) (1, 15)
346 NUMBER '1' (1, 16) (1, 17)
347 OP '>=' (1, 18) (1, 20)
348 NUMBER '5' (1, 21) (1, 22)
349 OP '<=' (1, 23) (1, 25)
350 NUMBER '0x15' (1, 26) (1, 30)
351 OP '<=' (1, 31) (1, 33)
352 NUMBER '0x12' (1, 34) (1, 38)
353 OP '!=' (1, 39) (1, 41)
354 NUMBER '1' (1, 42) (1, 43)
355 NAME 'and' (1, 44) (1, 47)
356 NUMBER '5' (1, 48) (1, 49)
357 NAME 'in' (1, 50) (1, 52)
358 NUMBER '1' (1, 53) (1, 54)
359 NAME 'not' (1, 55) (1, 58)
360 NAME 'in' (1, 59) (1, 61)
361 NUMBER '1' (1, 62) (1, 63)
362 NAME 'is' (1, 64) (1, 66)
363 NUMBER '1' (1, 67) (1, 68)
364 NAME 'or' (1, 69) (1, 71)
365 NUMBER '5' (1, 72) (1, 73)
366 NAME 'is' (1, 74) (1, 76)
367 NAME 'not' (1, 77) (1, 80)
368 NUMBER '1' (1, 81) (1, 82)
369 OP ':' (1, 82) (1, 83)
370 NAME 'pass' (1, 84) (1, 88)
371
372Shift
373
374 >>> dump_tokens("x = 1 << 1 >> 5")
Trent Nelson428de652008-03-18 22:41:35 +0000375 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000376 NAME 'x' (1, 0) (1, 1)
377 OP '=' (1, 2) (1, 3)
378 NUMBER '1' (1, 4) (1, 5)
379 OP '<<' (1, 6) (1, 8)
380 NUMBER '1' (1, 9) (1, 10)
381 OP '>>' (1, 11) (1, 13)
382 NUMBER '5' (1, 14) (1, 15)
383
384Additive
385
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000386 >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
Trent Nelson428de652008-03-18 22:41:35 +0000387 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000388 NAME 'x' (1, 0) (1, 1)
389 OP '=' (1, 2) (1, 3)
390 NUMBER '1' (1, 4) (1, 5)
391 OP '-' (1, 6) (1, 7)
392 NAME 'y' (1, 8) (1, 9)
393 OP '+' (1, 10) (1, 11)
394 NUMBER '15' (1, 12) (1, 14)
395 OP '-' (1, 15) (1, 16)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000396 NUMBER '1' (1, 17) (1, 18)
397 OP '+' (1, 19) (1, 20)
398 NUMBER '0x124' (1, 21) (1, 26)
399 OP '+' (1, 27) (1, 28)
400 NAME 'z' (1, 29) (1, 30)
401 OP '+' (1, 31) (1, 32)
402 NAME 'a' (1, 33) (1, 34)
403 OP '[' (1, 34) (1, 35)
404 NUMBER '5' (1, 35) (1, 36)
405 OP ']' (1, 36) (1, 37)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000406
407Multiplicative
408
409 >>> dump_tokens("x = 1//1*1/5*12%0x12")
Trent Nelson428de652008-03-18 22:41:35 +0000410 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000411 NAME 'x' (1, 0) (1, 1)
412 OP '=' (1, 2) (1, 3)
413 NUMBER '1' (1, 4) (1, 5)
414 OP '//' (1, 5) (1, 7)
415 NUMBER '1' (1, 7) (1, 8)
416 OP '*' (1, 8) (1, 9)
417 NUMBER '1' (1, 9) (1, 10)
418 OP '/' (1, 10) (1, 11)
419 NUMBER '5' (1, 11) (1, 12)
420 OP '*' (1, 12) (1, 13)
421 NUMBER '12' (1, 13) (1, 15)
422 OP '%' (1, 15) (1, 16)
423 NUMBER '0x12' (1, 16) (1, 20)
424
425Unary
426
427 >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
Trent Nelson428de652008-03-18 22:41:35 +0000428 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000429 OP '~' (1, 0) (1, 1)
430 NUMBER '1' (1, 1) (1, 2)
431 OP '^' (1, 3) (1, 4)
432 NUMBER '1' (1, 5) (1, 6)
433 OP '&' (1, 7) (1, 8)
434 NUMBER '1' (1, 9) (1, 10)
435 OP '|' (1, 11) (1, 12)
436 NUMBER '1' (1, 12) (1, 13)
437 OP '^' (1, 14) (1, 15)
438 OP '-' (1, 16) (1, 17)
439 NUMBER '1' (1, 17) (1, 18)
440 >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
Trent Nelson428de652008-03-18 22:41:35 +0000441 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000442 OP '-' (1, 0) (1, 1)
443 NUMBER '1' (1, 1) (1, 2)
444 OP '*' (1, 2) (1, 3)
445 NUMBER '1' (1, 3) (1, 4)
446 OP '/' (1, 4) (1, 5)
447 NUMBER '1' (1, 5) (1, 6)
448 OP '+' (1, 6) (1, 7)
449 NUMBER '1' (1, 7) (1, 8)
450 OP '*' (1, 8) (1, 9)
451 NUMBER '1' (1, 9) (1, 10)
452 OP '//' (1, 10) (1, 12)
453 NUMBER '1' (1, 12) (1, 13)
454 OP '-' (1, 14) (1, 15)
455 OP '-' (1, 16) (1, 17)
456 OP '-' (1, 17) (1, 18)
457 OP '-' (1, 18) (1, 19)
458 NUMBER '1' (1, 19) (1, 20)
459 OP '**' (1, 20) (1, 22)
460 NUMBER '1' (1, 22) (1, 23)
461
462Selector
463
464 >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
Trent Nelson428de652008-03-18 22:41:35 +0000465 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000466 NAME 'import' (1, 0) (1, 6)
467 NAME 'sys' (1, 7) (1, 10)
468 OP ',' (1, 10) (1, 11)
469 NAME 'time' (1, 12) (1, 16)
470 NEWLINE '\\n' (1, 16) (1, 17)
471 NAME 'x' (2, 0) (2, 1)
472 OP '=' (2, 2) (2, 3)
473 NAME 'sys' (2, 4) (2, 7)
474 OP '.' (2, 7) (2, 8)
475 NAME 'modules' (2, 8) (2, 15)
476 OP '[' (2, 15) (2, 16)
477 STRING "'time'" (2, 16) (2, 22)
478 OP ']' (2, 22) (2, 23)
479 OP '.' (2, 23) (2, 24)
480 NAME 'time' (2, 24) (2, 28)
481 OP '(' (2, 28) (2, 29)
482 OP ')' (2, 29) (2, 30)
483
484Methods
485
486 >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
Trent Nelson428de652008-03-18 22:41:35 +0000487 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000488 OP '@' (1, 0) (1, 1)
489 NAME 'staticmethod (1, 1) (1, 13)
490 NEWLINE '\\n' (1, 13) (1, 14)
491 NAME 'def' (2, 0) (2, 3)
492 NAME 'foo' (2, 4) (2, 7)
493 OP '(' (2, 7) (2, 8)
494 NAME 'x' (2, 8) (2, 9)
495 OP ',' (2, 9) (2, 10)
496 NAME 'y' (2, 10) (2, 11)
497 OP ')' (2, 11) (2, 12)
498 OP ':' (2, 12) (2, 13)
499 NAME 'pass' (2, 14) (2, 18)
500
501Backslash means line continuation, except for comments
502
503 >>> roundtrip("x=1+\\\\n"
504 ... "1\\n"
505 ... "# This is a comment\\\\n"
506 ... "# This also\\n")
507 True
508 >>> roundtrip("# Comment \\\\nx = 0")
509 True
Christian Heimesba4af492008-03-28 00:55:15 +0000510
511Two string literals on the same line
512
513 >>> roundtrip("'' ''")
514 True
515
Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

tokenize is broken on test_pep3131.py because regular expressions are
broken on the obscure unicode identifiers in it. *sigh*
    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = UR'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "UR'green'"   (2, 7) (2, 16)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob
import token

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
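        # Fixed column widths (10 chars for the type, 13 for the token
        # repr) keep the table aligned; longer tokens are clipped, which
        # is why e.g. 'staticmethod appears truncated in the doctests
        # above.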
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
602 """
603 Test roundtrip for `untokenize`. `f` is an open file or a string.
604 The source code in f is tokenized, converted back to source code via
605 tokenize.untokenize(), and tokenized again from the latter. The test
606 fails if the second tokenization doesn't match the first.
607 """
608 if isinstance(f, str):
609 f = BytesIO(f.encode('utf-8'))
Brian Curtin9f5f65c2010-10-30 21:35:28 +0000610 try:
611 token_list = list(tokenize(f.readline))
612 finally:
613 f.close()
Trent Nelson428de652008-03-18 22:41:35 +0000614 tokens1 = [tok[:2] for tok in token_list]
615 new_bytes = untokenize(tokens1)
Ezio Melottid8b509b2011-09-28 17:37:55 +0300616 readline = (line for line in new_bytes.splitlines(keepends=True)).__next__
Trent Nelson428de652008-03-18 22:41:35 +0000617 tokens2 = [tok[:2] for tok in tokenize(readline)]
618 return tokens1 == tokens2
Thomas Wouters89f507f2006-12-13 04:49:30 +0000619
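# A minimal sketch of the property roundtrip() verifies (illustrative
# only, not part of the test suite): untokenize() on (type, string)
# pairs may normalize whitespace, which is why the comparison is over
# token streams rather than over the raw bytes.
def _roundtrip_sketch(source=b"if x == 1 :\n    print(x)\n"):
    toks1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    new_source = untokenize(toks1)  # bytes; whitespace may differ
    toks2 = [tok[:2] for tok in tokenize(BytesIO(new_source).readline)]
    return toks1 == toks2
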
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
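        # (the tuple fields are type, string, start, end, line; type 3
        # is token.STRING)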
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
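        # Emulate a binary file's readline(): hand out the given lines
        # one at a time, then raise StopIteration, which
        # detect_encoding() treats as end of input.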
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

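        # tokenize() must replay the lines already consumed by
        # detect_encoding() ('first' and 'second' here) before drawing
        # fresh lines from readline(); the assertion below checks that.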
        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

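    # tokenize() reports every operator with the generic token.OP type;
    # TokenInfo.exact_type recovers the specific operator (token.PLUS,
    # token.LPAR, ...), which is what the helper below verifies.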
    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

__test__ = {"doctests": doctests, "decistmt": decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()