# -*- coding: utf-8 -*-

doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "    print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, tok_name, detect_encoding)
from io import BytesIO
from unittest import TestCase
import os, sys, glob

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    try:
        token_list = list(tokenize(f.readline))
    finally:
        f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(1)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2
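
# A minimal sketch (not used by the tests) of the round-trip property that
# roundtrip() above verifies: the (type, string) pairs produced by tokenize()
# are expected to be unchanged after regenerating source with untokenize().
def _roundtrip_sketch(source=b"x = 1 + 2\n"):
    first = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    regenerated = untokenize(first)   # bytes, because of the ENCODING token
    second = [tok[:2] for tok in tokenize(BytesIO(regenerated).readline)]
    return first == second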

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')
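
# Note on the spacing in decistmt()'s expected output above: when untokenize()
# is given bare (type, string) pairs it cannot recover the original column
# layout, so it emits a space after NAME and NUMBER tokens; hence
# "print (+Decimal ('21.3e-5')..." rather than "print(+Decimal('21.3e-5')...".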
641
642
643class TestTokenizerAdheresToPep0263(TestCase):
644 """
645 Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
646 """
647
648 def _testFile(self, filename):
649 path = os.path.join(os.path.dirname(__file__), filename)
650 return roundtrip(open(path, 'rb'))
651
652 def test_utf8_coding_cookie_and_no_utf8_bom(self):
653 f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
654 self.assertTrue(self._testFile(f))
655
656 def test_latin1_coding_cookie_and_utf8_bom(self):
657 """
658 As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
659 allowed encoding for the comment is 'utf-8'. The text file used in
660 this test starts with a BOM signature, but specifies latin1 as the
661 coding, so verify that a SyntaxError is raised, which matches the
662 behaviour of the interpreter when it encounters a similar condition.
663 """
664 f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
Georg Brandlab91fde2009-08-13 08:51:18 +0000665 self.assertRaises(SyntaxError, self._testFile, f)
Trent Nelson428de652008-03-18 22:41:35 +0000666
667 def test_no_coding_cookie_and_utf8_bom(self):
668 f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
669 self.assertTrue(self._testFile(f))
670
671 def test_utf8_coding_cookie_and_utf8_bom(self):
672 f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
673 self.assertTrue(self._testFile(f))
674
675
676class Test_Tokenize(TestCase):
677
678 def test__tokenize_decodes_with_specified_encoding(self):
679 literal = '"ЉЊЈЁЂ"'
680 line = literal.encode('utf-8')
681 first = False
682 def readline():
683 nonlocal first
684 if not first:
685 first = True
686 return line
687 else:
688 return b''
689
690 # skip the initial encoding token and the end token
691 tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
692 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melotti19f2aeb2010-11-21 01:30:29 +0000693 self.assertEqual(tokens, expected_tokens,
694 "bytes not decoded with encoding")
Trent Nelson428de652008-03-18 22:41:35 +0000695
696 def test__tokenize_does_not_decode_with_encoding_none(self):
697 literal = '"ЉЊЈЁЂ"'
698 first = False
699 def readline():
700 nonlocal first
701 if not first:
702 first = True
703 return literal
704 else:
705 return b''
706
707 # skip the end token
708 tokens = list(_tokenize(readline, encoding=None))[:-1]
709 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melotti19f2aeb2010-11-21 01:30:29 +0000710 self.assertEqual(tokens, expected_tokens,
711 "string not tokenized when encoding is None")
Trent Nelson428de652008-03-18 22:41:35 +0000712
713
class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

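# A hedged illustration (mirroring the tests above) of detect_encoding()'s
# contract: it returns the canonical encoding name plus the list of raw lines
# it consumed while looking for a BOM or a PEP 263 coding cookie, e.g.
#
#     >>> detect_encoding(BytesIO(b"# coding: latin-1\nx = 1\n").readline)
#     ('iso-8859-1', [b'# coding: latin-1\n'])
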
class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)


__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()