# -*- coding: utf-8 -*-

doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def  y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else:   print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random Python modules.
Pass the '-ucompiler' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

    >>> if not support.is_resource_enabled("compiler"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs
    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
"""
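
# These doctests are collected in the __test__ dictionary at the bottom of
# this module and run by support.run_doctest() in test_main().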

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, tok_name, detect_encoding)
from io import BytesIO
from unittest import TestCase
import os, sys, glob

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())
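
# Note: the "-13.13r" format above truncates each token repr to 13
# characters, which is why long values in the doctests appear clipped
# (e.g. "'0xffffffffff" with no closing quote).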

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    token_list = list(tokenize(f.readline))
    f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(1)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2
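
# A minimal sketch (illustrative only, not part of the original module) of
# the property that roundtrip() checks: source regenerated by untokenize()
# retokenizes to the same (type, string) pairs.
def _roundtrip_demo():
    source = b"x = 1 + 2\n"
    tokens1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    regenerated = untokenize(tokens1)   # bytes of equivalent source code
    tokens2 = [tok[:2] for tok in tokenize(BytesIO(regenerated).readline)]
    assert tokens1 == tokens2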

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')
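
# The same tokenize/rewrite/untokenize pattern generalizes to other source
# transformations; a sketch of a hypothetical helper (not part of the
# original module) that renames one identifier at the token level:
def _rename_demo(source, old, new):
    result = []
    readline = BytesIO(source.encode('utf-8')).readline
    for toknum, tokval, _, _, _ in tokenize(readline):
        if toknum == NAME and tokval == old:
            tokval = new                # swap the identifier's text
        result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')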


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEquals(tokens, expected_tokens,
                          "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEquals(tokens, expected_tokens,
                          "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines,
                          [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'iso-8859-1')
        self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEquals(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines,
                          [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEquals(found, "iso-8859-1")

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEquals(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

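# A minimal sketch (illustrative only, not part of the original suite) of the
# detect_encoding() behaviour exercised above: it reads at most two lines and
# returns the normalized encoding name plus the lines it consumed, mirroring
# TestDetectEncoding.test_cookie_first_line_no_bom.
def _detect_encoding_demo():
    source = b'# -*- coding: latin-1 -*-\nprint("hi")\n'
    encoding, consumed = detect_encoding(BytesIO(source).readline)
    assert encoding == 'iso-8859-1'     # 'latin-1' is normalized
    assert consumed == [b'# -*- coding: latin-1 -*-\n']
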
class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEquals(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEquals(encoding_used, encoding)


__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()