Blame - Lib/test/test_tokenize.py - platform/external/python/cpython3

2008-03-16 00:07:10 +0000

[diff] [blame]

1

doctests = """

2

Tests for the tokenize module.

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

3

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

4

The tests can be really simple. Given a small fragment of source

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

5

code, print out a table with tokens. The ENDMARKER is omitted for

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

6

brevity.

7

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

8

>>> dump_tokens("1 + 1")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

9

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

10

NUMBER '1' (1, 0) (1, 1)

11

OP '+' (1, 2) (1, 3)

12

NUMBER '1' (1, 4) (1, 5)

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

13

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

14

>>> dump_tokens("if False:\\n"

15

... " # NL\\n"

16

... " True = False # NEWLINE\\n")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

17

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

18

NAME 'if' (1, 0) (1, 2)

19

NAME 'False' (1, 3) (1, 8)

20

OP ':' (1, 8) (1, 9)

21

NEWLINE '\\n' (1, 9) (1, 10)

22

COMMENT '# NL' (2, 4) (2, 8)

23

NL '\\n' (2, 8) (2, 9)

24

INDENT ' ' (3, 0) (3, 4)

25

NAME 'True' (3, 4) (3, 8)

26

OP '=' (3, 9) (3, 10)

27

NAME 'False' (3, 11) (3, 16)

28

COMMENT '# NEWLINE' (3, 17) (3, 26)

29

NEWLINE '\\n' (3, 26) (3, 27)

30

DEDENT '' (4, 0) (4, 0)

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

31

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

32

>>> indent_error_file = \"""

... def k(x):

... x += 2

... x += 5

... \"""

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

37

>>> readline = BytesIO(indent_error_file.encode('utf-8')).readline

38

>>> for tok in tokenize(readline): pass

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

39

Traceback (most recent call last):

40

...

41

IndentationError: unindent does not match any outer indentation level

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

42

Mark Dickinson

3c0b317

2010-06-29 07:38:37 +0000

[diff] [blame]

43

There are some standard formatting practices that are easy to get right.

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

44

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

45

>>> roundtrip("if x == 1:\\n"

46

... " print(x)\\n")

47

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

48

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

49

>>> roundtrip("# This is a comment\\n# This also")

50

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

51

52

Some people use different formatting conventions, which makes

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

53

untokenize a little trickier. Note that this test involves trailing

54

whitespace after the colon. Note that we use hex escapes to make the

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

55

two trailing blanks apparent in the expected output.

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

56

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

57

>>> roundtrip("if x == 1 : \\n"

58

... " print(x)\\n")

59

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

60

Benjamin Peterson

ee8712c

2008-05-20 21:35:26 +0000

[diff] [blame]

61

>>> f = support.findfile("tokenize_tests.txt")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

62

>>> roundtrip(open(f, 'rb'))

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

63

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

64

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

65

>>> roundtrip("if x == 1:\\n"

66

... " # A comment by itself.\\n"

67

... " print(x) # Comment here, too.\\n"

68

... " # Another comment.\\n"

69

... "after_if = True\\n")

70

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

71

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

72

>>> roundtrip("if (x # The comments need to go in the right place\\n"

73

... " == 1):\\n"

74

... " print('x==1')\\n")

75

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

76

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

77

>>> roundtrip("class Test: # A comment here\\n"

78

... " # A comment with weird indent\\n"

79

... " after_com = 5\\n"

80

... " def x(m): return m*5 # a one liner\\n"

81

... " def y(m): # A whitespace after the colon\\n"

82

... " return y*4 # 3-space indent\\n")

83

True

84

85

Some error-handling code

86

87

>>> roundtrip("try: import somemodule\\n"

88

... "except ImportError: # comment\\n"

Christian Heimes

2008-03-28 00:55:15 +0000

[diff] [blame]

89

... " print('Can not import' # comment2\\n)"

Neal Norwitz

752abd0

2008-05-13 04:55:24 +0000

[diff] [blame]

90

... "else: print('Loaded')\\n")

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

91

True

92

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

93

Balancing continuation

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

94

95

>>> roundtrip("a = (3,4, \\n"

... "5,6)\\n"

... "y = [3, 4,\\n"

... "5]\\n"

... "z = {'a': 5,\\n"

100

... "'b':15, 'c':True}\\n"

101

... "x = len(y) + 5 - a[\\n"

102

... "3] - a[2]\\n"

103

... "+ len(z) - z[\\n"

... "'b']\\n")

True

Ordinary integers and binary operators

108

109

>>> dump_tokens("0xff <= 255")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

110

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

111

NUMBER '0xff' (1, 0) (1, 4)

112

OP '<=' (1, 5) (1, 7)

113

NUMBER '255' (1, 8) (1, 11)

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

114

>>> dump_tokens("0b10 <= 255")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

115

ENCODING 'utf-8' (0, 0) (0, 0)

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

116

NUMBER '0b10' (1, 0) (1, 4)

117

OP '<=' (1, 5) (1, 7)

118

NUMBER '255' (1, 8) (1, 11)

119

>>> dump_tokens("0o123 <= 0O123")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

120

ENCODING 'utf-8' (0, 0) (0, 0)

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

121

NUMBER '0o123' (1, 0) (1, 5)

122

OP '<=' (1, 6) (1, 8)

123

NUMBER '0O123' (1, 9) (1, 14)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

124

>>> dump_tokens("1234567 > ~0x15")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

125

ENCODING 'utf-8' (0, 0) (0, 0)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

126

NUMBER '1234567' (1, 0) (1, 7)

127

OP '>' (1, 8) (1, 9)

128

OP '~' (1, 10) (1, 11)

129

NUMBER '0x15' (1, 11) (1, 15)

130

>>> dump_tokens("2134568 != 1231515")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

131

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

132

NUMBER '2134568' (1, 0) (1, 7)

133

OP '!=' (1, 8) (1, 10)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

134

NUMBER '1231515' (1, 11) (1, 18)

135

>>> dump_tokens("(-124561-1) & 200000000")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

136

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

137

OP '(' (1, 0) (1, 1)

138

OP '-' (1, 1) (1, 2)

139

NUMBER '124561' (1, 2) (1, 8)

140

OP '-' (1, 8) (1, 9)

141

NUMBER '1' (1, 9) (1, 10)

142

OP ')' (1, 10) (1, 11)

143

OP '&' (1, 12) (1, 13)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

144

NUMBER '200000000' (1, 14) (1, 23)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

145

>>> dump_tokens("0xdeadbeef != -1")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

146

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

147

NUMBER '0xdeadbeef' (1, 0) (1, 10)

148

OP '!=' (1, 11) (1, 13)

149

OP '-' (1, 14) (1, 15)

150

NUMBER '1' (1, 15) (1, 16)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

151

>>> dump_tokens("0xdeadc0de & 12345")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

152

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

153

NUMBER '0xdeadc0de' (1, 0) (1, 10)

154

OP '&' (1, 11) (1, 12)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

155

NUMBER '12345' (1, 13) (1, 18)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

156

>>> dump_tokens("0xFF & 0x15 | 1234")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

157

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

158

NUMBER '0xFF' (1, 0) (1, 4)

159

OP '&' (1, 5) (1, 6)

160

NUMBER '0x15' (1, 7) (1, 11)

161

OP '|' (1, 12) (1, 13)

162

NUMBER '1234' (1, 14) (1, 18)

Long integers

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

166

>>> dump_tokens("x = 0")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

167

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

168

NAME 'x' (1, 0) (1, 1)

169

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

170

NUMBER '0' (1, 4) (1, 5)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

171

>>> dump_tokens("x = 0xfffffffffff")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

172

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

173

NAME 'x' (1, 0) (1, 1)

174

OP '=' (1, 2) (1, 3)

175

NUMBER '0xffffffffff (1, 4) (1, 17)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

176

>>> dump_tokens("x = 123141242151251616110")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

177

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

178

NAME 'x' (1, 0) (1, 1)

179

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

180

NUMBER '123141242151 (1, 4) (1, 25)

181

>>> dump_tokens("x = -15921590215012591")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

182

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

183

NAME 'x' (1, 0) (1, 1)

184

OP '=' (1, 2) (1, 3)

185

OP '-' (1, 4) (1, 5)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

186

NUMBER '159215902150 (1, 5) (1, 22)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

187

188

Floating point numbers

189

190

>>> dump_tokens("x = 3.14159")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

191

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

192

NAME 'x' (1, 0) (1, 1)

193

OP '=' (1, 2) (1, 3)

194

NUMBER '3.14159' (1, 4) (1, 11)

195

>>> dump_tokens("x = 314159.")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

196

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

197

NAME 'x' (1, 0) (1, 1)

198

OP '=' (1, 2) (1, 3)

199

NUMBER '314159.' (1, 4) (1, 11)

200

>>> dump_tokens("x = .314159")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

201

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

202

NAME 'x' (1, 0) (1, 1)

203

OP '=' (1, 2) (1, 3)

204

NUMBER '.314159' (1, 4) (1, 11)

205

>>> dump_tokens("x = 3e14159")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

206

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

207

NAME 'x' (1, 0) (1, 1)

208

OP '=' (1, 2) (1, 3)

209

NUMBER '3e14159' (1, 4) (1, 11)

210

>>> dump_tokens("x = 3E123")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

211

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

212

NAME 'x' (1, 0) (1, 1)

213

OP '=' (1, 2) (1, 3)

214

NUMBER '3E123' (1, 4) (1, 9)

215

>>> dump_tokens("x+y = 3e-1230")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

216

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

217

NAME 'x' (1, 0) (1, 1)

218

OP '+' (1, 1) (1, 2)

219

NAME 'y' (1, 2) (1, 3)

220

OP '=' (1, 4) (1, 5)

221

NUMBER '3e-1230' (1, 6) (1, 13)

222

>>> dump_tokens("x = 3.14e159")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

223

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

224

NAME 'x' (1, 0) (1, 1)

225

OP '=' (1, 2) (1, 3)

226

NUMBER '3.14e159' (1, 4) (1, 12)

String literals

>>> dump_tokens("x = ''; y = \\\"\\\"")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

231

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

232

NAME 'x' (1, 0) (1, 1)

233

OP '=' (1, 2) (1, 3)

234

STRING "''" (1, 4) (1, 6)

235

OP ';' (1, 6) (1, 7)

236

NAME 'y' (1, 8) (1, 9)

237

OP '=' (1, 10) (1, 11)

238

STRING '""' (1, 12) (1, 14)

239

>>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

240

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

241

NAME 'x' (1, 0) (1, 1)

242

OP '=' (1, 2) (1, 3)

243

STRING '\\'"\\'' (1, 4) (1, 7)

244

OP ';' (1, 7) (1, 8)

245

NAME 'y' (1, 9) (1, 10)

246

OP '=' (1, 11) (1, 12)

247

STRING '"\\'"' (1, 13) (1, 16)

248

>>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

249

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

250

NAME 'x' (1, 0) (1, 1)

251

OP '=' (1, 2) (1, 3)

252

STRING '"doesn\\'t "' (1, 4) (1, 14)

253

NAME 'shrink' (1, 14) (1, 20)

254

STRING '", does it"' (1, 20) (1, 31)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

255

>>> dump_tokens("x = 'abc' + 'ABC'")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

256

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

257

NAME 'x' (1, 0) (1, 1)

258

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

259

STRING "'abc'" (1, 4) (1, 9)

260

OP '+' (1, 10) (1, 11)

261

STRING "'ABC'" (1, 12) (1, 17)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

262

>>> dump_tokens('y = "ABC" + "ABC"')

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

263

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

264

NAME 'y' (1, 0) (1, 1)

265

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

266

STRING '"ABC"' (1, 4) (1, 9)

267

OP '+' (1, 10) (1, 11)

268

STRING '"ABC"' (1, 12) (1, 17)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

269

>>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

270

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

271

NAME 'x' (1, 0) (1, 1)

272

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

273

STRING "r'abc'" (1, 4) (1, 10)

274

OP '+' (1, 11) (1, 12)

275

STRING "r'ABC'" (1, 13) (1, 19)

276

OP '+' (1, 20) (1, 21)

277

STRING "R'ABC'" (1, 22) (1, 28)

278

OP '+' (1, 29) (1, 30)

279

STRING "R'ABC'" (1, 31) (1, 37)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

280

>>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

281

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

282

NAME 'y' (1, 0) (1, 1)

283

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

284

STRING 'r"abc"' (1, 4) (1, 10)

285

OP '+' (1, 11) (1, 12)

286

STRING 'r"ABC"' (1, 13) (1, 19)

287

OP '+' (1, 20) (1, 21)

288

STRING 'R"ABC"' (1, 22) (1, 28)

289

OP '+' (1, 29) (1, 30)

290

STRING 'R"ABC"' (1, 31) (1, 37)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

Operators

>>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

295

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

296

NAME 'def' (1, 0) (1, 3)

297

NAME 'd22' (1, 4) (1, 7)

298

OP '(' (1, 7) (1, 8)

299

NAME 'a' (1, 8) (1, 9)

300

OP ',' (1, 9) (1, 10)

301

NAME 'b' (1, 11) (1, 12)

302

OP ',' (1, 12) (1, 13)

303

NAME 'c' (1, 14) (1, 15)

304

OP '=' (1, 15) (1, 16)

305

NUMBER '2' (1, 16) (1, 17)

306

OP ',' (1, 17) (1, 18)

307

NAME 'd' (1, 19) (1, 20)

308

OP '=' (1, 20) (1, 21)

309

NUMBER '2' (1, 21) (1, 22)

310

OP ',' (1, 22) (1, 23)

311

OP '*' (1, 24) (1, 25)

312

NAME 'k' (1, 25) (1, 26)

313

OP ')' (1, 26) (1, 27)

314

OP ':' (1, 27) (1, 28)

315

NAME 'pass' (1, 29) (1, 33)

316

>>> dump_tokens("def d01v_(a=1, *k, **w): pass")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

317

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

318

NAME 'def' (1, 0) (1, 3)

319

NAME 'd01v_' (1, 4) (1, 9)

320

OP '(' (1, 9) (1, 10)

321

NAME 'a' (1, 10) (1, 11)

322

OP '=' (1, 11) (1, 12)

323

NUMBER '1' (1, 12) (1, 13)

324

OP ',' (1, 13) (1, 14)

325

OP '*' (1, 15) (1, 16)

326

NAME 'k' (1, 16) (1, 17)

327

OP ',' (1, 17) (1, 18)

328

OP '**' (1, 19) (1, 21)

329

NAME 'w' (1, 21) (1, 22)

330

OP ')' (1, 22) (1, 23)

331

OP ':' (1, 23) (1, 24)

332

NAME 'pass' (1, 25) (1, 29)

Comparison

>>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +

337

... "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

338

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

339

NAME 'if' (1, 0) (1, 2)

340

NUMBER '1' (1, 3) (1, 4)

341

OP '<' (1, 5) (1, 6)

342

NUMBER '1' (1, 7) (1, 8)

343

OP '>' (1, 9) (1, 10)

344

NUMBER '1' (1, 11) (1, 12)

345

OP '==' (1, 13) (1, 15)

346

NUMBER '1' (1, 16) (1, 17)

347

OP '>=' (1, 18) (1, 20)

348

NUMBER '5' (1, 21) (1, 22)

349

OP '<=' (1, 23) (1, 25)

350

NUMBER '0x15' (1, 26) (1, 30)

351

OP '<=' (1, 31) (1, 33)

352

NUMBER '0x12' (1, 34) (1, 38)

353

OP '!=' (1, 39) (1, 41)

354

NUMBER '1' (1, 42) (1, 43)

355

NAME 'and' (1, 44) (1, 47)

356

NUMBER '5' (1, 48) (1, 49)

357

NAME 'in' (1, 50) (1, 52)

358

NUMBER '1' (1, 53) (1, 54)

359

NAME 'not' (1, 55) (1, 58)

360

NAME 'in' (1, 59) (1, 61)

361

NUMBER '1' (1, 62) (1, 63)

362

NAME 'is' (1, 64) (1, 66)

363

NUMBER '1' (1, 67) (1, 68)

364

NAME 'or' (1, 69) (1, 71)

365

NUMBER '5' (1, 72) (1, 73)

366

NAME 'is' (1, 74) (1, 76)

367

NAME 'not' (1, 77) (1, 80)

368

NUMBER '1' (1, 81) (1, 82)

369

OP ':' (1, 82) (1, 83)

370

NAME 'pass' (1, 84) (1, 88)

Shift

>>> dump_tokens("x = 1 << 1 >> 5")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

375

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

376

NAME 'x' (1, 0) (1, 1)

377

OP '=' (1, 2) (1, 3)

378

NUMBER '1' (1, 4) (1, 5)

379

OP '<<' (1, 6) (1, 8)

380

NUMBER '1' (1, 9) (1, 10)

381

OP '>>' (1, 11) (1, 13)

382

NUMBER '5' (1, 14) (1, 15)

Additive

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

386

>>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

387

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

388

NAME 'x' (1, 0) (1, 1)

389

OP '=' (1, 2) (1, 3)

390

NUMBER '1' (1, 4) (1, 5)

391

OP '-' (1, 6) (1, 7)

392

NAME 'y' (1, 8) (1, 9)

393

OP '+' (1, 10) (1, 11)

394

NUMBER '15' (1, 12) (1, 14)

395

OP '-' (1, 15) (1, 16)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

396

NUMBER '1' (1, 17) (1, 18)

397

OP '+' (1, 19) (1, 20)

398

NUMBER '0x124' (1, 21) (1, 26)

399

OP '+' (1, 27) (1, 28)

400

NAME 'z' (1, 29) (1, 30)

401

OP '+' (1, 31) (1, 32)

402

NAME 'a' (1, 33) (1, 34)

403

OP '[' (1, 34) (1, 35)

404

NUMBER '5' (1, 35) (1, 36)

405

OP ']' (1, 36) (1, 37)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

Multiplicative

>>> dump_tokens("x = 1//1*1/5*12%0x12")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

410

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

411

NAME 'x' (1, 0) (1, 1)

412

OP '=' (1, 2) (1, 3)

413

NUMBER '1' (1, 4) (1, 5)

414

OP '//' (1, 5) (1, 7)

415

NUMBER '1' (1, 7) (1, 8)

416

OP '*' (1, 8) (1, 9)

417

NUMBER '1' (1, 9) (1, 10)

418

OP '/' (1, 10) (1, 11)

419

NUMBER '5' (1, 11) (1, 12)

420

OP '*' (1, 12) (1, 13)

421

NUMBER '12' (1, 13) (1, 15)

422

OP '%' (1, 15) (1, 16)

423

NUMBER '0x12' (1, 16) (1, 20)

Unary

>>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

428

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

429

OP '~' (1, 0) (1, 1)

430

NUMBER '1' (1, 1) (1, 2)

431

OP '^' (1, 3) (1, 4)

432

NUMBER '1' (1, 5) (1, 6)

433

OP '&' (1, 7) (1, 8)

434

NUMBER '1' (1, 9) (1, 10)

435

OP '|' (1, 11) (1, 12)

436

NUMBER '1' (1, 12) (1, 13)

437

OP '^' (1, 14) (1, 15)

438

OP '-' (1, 16) (1, 17)

439

NUMBER '1' (1, 17) (1, 18)

440

>>> dump_tokens("-1*1/1+1*1//1 - ---1**1")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

441

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

442

OP '-' (1, 0) (1, 1)

443

NUMBER '1' (1, 1) (1, 2)

444

OP '*' (1, 2) (1, 3)

445

NUMBER '1' (1, 3) (1, 4)

446

OP '/' (1, 4) (1, 5)

447

NUMBER '1' (1, 5) (1, 6)

448

OP '+' (1, 6) (1, 7)

449

NUMBER '1' (1, 7) (1, 8)

450

OP '*' (1, 8) (1, 9)

451

NUMBER '1' (1, 9) (1, 10)

452

OP '//' (1, 10) (1, 12)

453

NUMBER '1' (1, 12) (1, 13)

454

OP '-' (1, 14) (1, 15)

455

OP '-' (1, 16) (1, 17)

456

OP '-' (1, 17) (1, 18)

457

OP '-' (1, 18) (1, 19)

458

NUMBER '1' (1, 19) (1, 20)

459

OP '**' (1, 20) (1, 22)

460

NUMBER '1' (1, 22) (1, 23)

Selector

>>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

465

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

466

NAME 'import' (1, 0) (1, 6)

467

NAME 'sys' (1, 7) (1, 10)

468

OP ',' (1, 10) (1, 11)

469

NAME 'time' (1, 12) (1, 16)

470

NEWLINE '\\n' (1, 16) (1, 17)

471

NAME 'x' (2, 0) (2, 1)

472

OP '=' (2, 2) (2, 3)

473

NAME 'sys' (2, 4) (2, 7)

474

OP '.' (2, 7) (2, 8)

475

NAME 'modules' (2, 8) (2, 15)

476

OP '[' (2, 15) (2, 16)

477

STRING "'time'" (2, 16) (2, 22)

478

OP ']' (2, 22) (2, 23)

479

OP '.' (2, 23) (2, 24)

480

NAME 'time' (2, 24) (2, 28)

481

OP '(' (2, 28) (2, 29)

482

OP ')' (2, 29) (2, 30)

Methods

>>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

487

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

488

OP '@' (1, 0) (1, 1)

489

NAME 'staticmethod (1, 1) (1, 13)

490

NEWLINE '\\n' (1, 13) (1, 14)

491

NAME 'def' (2, 0) (2, 3)

492

NAME 'foo' (2, 4) (2, 7)

493

OP '(' (2, 7) (2, 8)

494

NAME 'x' (2, 8) (2, 9)

495

OP ',' (2, 9) (2, 10)

496

NAME 'y' (2, 10) (2, 11)

497

OP ')' (2, 11) (2, 12)

498

OP ':' (2, 12) (2, 13)

499

NAME 'pass' (2, 14) (2, 18)

500

501

Backslash means line continuation, except for comments

502

503

>>> roundtrip("x=1+\\\\n"

504

... "1\\n"

505

... "# This is a comment\\\\n"

506

... "# This also\\n")

507

True

508

>>> roundtrip("# Comment \\\\nx = 0")

509

True

Christian Heimes

2008-03-28 00:55:15 +0000

[diff] [blame]

510

511

Two string literals on the same line

512

513

>>> roundtrip("'' ''")

514

True

515

516

Test roundtrip on random python modules.

Antoine Pitrou

5bc4fa7

2010-10-14 15:34:31 +0000

[diff] [blame]

517

pass the '-ucpu' option to process the full directory.

Christian Heimes

2008-03-28 00:55:15 +0000

[diff] [blame]

518

519

>>> import random

520

>>> tempdir = os.path.dirname(f) or os.curdir

521

>>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

522

Antoine Pitrou

5bc4fa7

2010-10-14 15:34:31 +0000

[diff] [blame]

523

>>> if not support.is_resource_enabled("cpu"):

Christian Heimes

2008-03-28 00:55:15 +0000

[diff] [blame]

524

... testfiles = random.sample(testfiles, 10)

525

...

526

>>> for testfile in testfiles:

527

... if not roundtrip(open(testfile, 'rb')):

528

... print("Roundtrip failed for file %s" % testfile)

529

... break

530

... else: True

531

True

Benjamin Peterson

a0dfa82

2009-11-13 02:25:08 +0000

[diff] [blame]

532

533

Evil tabs

Benjamin Peterson

33856de

2010-08-30 14:41:20 +0000

[diff] [blame]

534

Benjamin Peterson

a0dfa82

2009-11-13 02:25:08 +0000

[diff] [blame]

535

>>> dump_tokens("def f():\\n\\tif x\\n \\tpass")

536

ENCODING 'utf-8' (0, 0) (0, 0)

537

NAME 'def' (1, 0) (1, 3)

538

NAME 'f' (1, 4) (1, 5)

OP '(' (1, 5) (1, 6)

OP ')' (1, 6) (1, 7)

OP ':' (1, 7) (1, 8)

NEWLINE '\\n' (1, 8) (1, 9)

543

INDENT '\\t' (2, 0) (2, 1)

544

NAME 'if' (2, 1) (2, 3)

545

NAME 'x' (2, 4) (2, 5)

546

NEWLINE '\\n' (2, 5) (2, 6)

547

INDENT ' \\t' (3, 0) (3, 9)

548

NAME 'pass' (3, 9) (3, 13)

549

DEDENT '' (4, 0) (4, 0)

550

DEDENT '' (4, 0) (4, 0)

Benjamin Peterson

33856de

2010-08-30 14:41:20 +0000

[diff] [blame]

551

552

Non-ascii identifiers

553

554

>>> dump_tokens("Örter = 'places'\\ngrün = 'green'")

555

ENCODING 'utf-8' (0, 0) (0, 0)

556

NAME 'Örter' (1, 0) (1, 5)

557

OP '=' (1, 6) (1, 7)

558

STRING "'places'" (1, 8) (1, 16)

559

NEWLINE '\\n' (1, 16) (1, 17)

560

NAME 'grün' (2, 0) (2, 4)

561

OP '=' (2, 5) (2, 6)

562

STRING "'green'" (2, 7) (2, 14)

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

563

"""

564

Benjamin Peterson

ee8712c

2008-05-20 21:35:26 +0000

[diff] [blame]

565

from test import support

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

566

from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,

567

STRING, ENDMARKER, tok_name, detect_encoding)

568

from io import BytesIO

569

from unittest import TestCase

570

import os, sys, glob

Raymond Hettinger

2005-06-10 11:05:19 +0000

[diff] [blame]

571

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

572

def dump_tokens(s):
    """Print one table row per token found in the source string *s*.

    Each row shows the token type name, the repr of the token text and
    the start and end positions.  Printing stops at the ENDMARKER token,
    which is not shown.
    """
    source = BytesIO(s.encode('utf-8'))
    for kind, text, begin, finish, _line in tokenize(source.readline):
        if kind == ENDMARKER:
            break
        # Explicit mapping for the named format fields below; the type
        # name and the token repr are padded/truncated to fixed widths.
        fields = {
            'type': tok_name[kind],
            'token': text,
            'start': begin,
            'end': finish,
        }
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % fields)

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

583

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

584

def roundtrip(f):
    """
    Check that `untokenize` output re-tokenizes to the same token stream.

    `f` is an open file or a string; a string is encoded as UTF-8 and
    wrapped in a BytesIO first.  The input is tokenized and then closed,
    the (type, string) pairs are handed to tokenize.untokenize(), and the
    resulting bytes are tokenized a second time.  Returns True when both
    passes yield the same (type, string) pairs, False otherwise.
    """
    stream = BytesIO(f.encode('utf-8')) if isinstance(f, str) else f
    try:
        first_pass = [tok[:2] for tok in tokenize(stream.readline)]
    finally:
        # Always release the stream, even if tokenizing fails.
        stream.close()
    regenerated = untokenize(first_pass)
    # Feed the regenerated source back one line at a time, newlines kept.
    next_line = iter(regenerated.splitlines(True)).__next__
    second_pass = [tok[:2] for tok in tokenize(next_line)]
    return first_pass == second_pass

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

602

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

603

# This is an example from the docs, set up as a doctest.

Raymond Hettinger

2005-06-10 11:05:19 +0000

[diff] [blame]

604

def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    def as_decimal(literal):
        # Token pairs spelling out a Decimal(...) call on the literal's repr.
        return [
            (NAME, 'Decimal'),
            (OP, '('),
            (STRING, repr(literal)),
            (OP, ')'),
        ]

    source = BytesIO(s.encode('utf-8'))
    # Only NUMBER tokens containing a '.' are rewritten; every other
    # token passes through as its (type, string) pair.
    rewritten = [
        pair
        for kind, text, *_ in tokenize(source.readline)
        for pair in (as_decimal(text)
                     if kind == NUMBER and '.' in text
                     else [(kind, text)])
    ]
    return untokenize(rewritten).decode('utf-8')

639

640

641

class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        # Round-trip the fixture *filename*, located in the same directory as
        # this module, and hand back the result of roundtrip().  Exceptions
        # raised while tokenizing propagate to the caller.
        path = os.path.join(os.path.dirname(__file__), filename)
        # The context manager guarantees the handle is closed even when
        # roundtrip() raises (as the latin1/BOM test expects it to).
        # NOTE(review): assumes roundtrip() has finished reading the file by
        # the time it returns -- confirm against its definition.
        with open(path, 'rb') as f:
            return roundtrip(f)

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        # Must use the fixture *without* a BOM; the with-BOM fixture is
        # covered by test_utf8_coding_cookie_and_utf8_bom below.
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'.  The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

672

673

674

class Test_Tokenize(TestCase):
    # Tests that call the private _tokenize() generator directly with an
    # explicit ``encoding`` argument.

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            # Hand out the encoded line once, then empty bytes on every
            # later call.
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        # STRING replaces the bare token number 3 for readability.
        expected_tokens = [(STRING, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            # Hand out the (already decoded) str once, then empty bytes on
            # every later call.
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(STRING, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")

710

711

712

class TestDetectEncoding(TestCase):
    # Tests for detect_encoding(): each test feeds it a readline callable
    # built from a fixed sequence of bytes lines and checks the reported
    # encoding and the list of lines it consumed.

    def get_readline(self, lines):
        # Return a zero-argument callable that yields the items of *lines*
        # one per call and raises StopIteration once they are exhausted.
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            # Try both the dashed and the underscored spelling of each name.
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            # Try both the dashed and the underscored spelling of each name.
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        # A single line with neither BOM nor cookie.
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        # No lines at all.
        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        # A single line that starts with a BOM.
        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        # Nothing but a BOM.
        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        # A cookie naming an unknown encoding.
        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

859

860

class TestTokenize(TestCase):
    # Checks how tokenize() wires detect_encoding() and _tokenize() together
    # by temporarily swapping both for local stand-ins.

    def test_tokenize(self):
        import tokenize as tokenize_module
        # Unique sentinel so the identity check at the end is meaningful.
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            # Record the encoding received and collect every truthy item
            # readline() produces; stop at the first falsy one.
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            # Return 1, 2, 3, 4 on successive calls and b'' on the fifth.
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            # Always restore the real functions so other tests are unaffected.
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        # assertTrue(a, b) would treat ``b`` as the failure message and pass
        # for any truthy ``a``; compare the two objects explicitly instead.
        self.assertIs(encoding_used, encoding)

Raymond Hettinger

2005-06-10 11:05:19 +0000

[diff] [blame]

900

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

901

902

# Module-level mapping that doctest consults for extra items to check: the
# ``doctests`` string defined at the top of this file and the ``decistmt``
# function (whose docstring carries examples).
__test__ = {"doctests" : doctests, 'decistmt': decistmt}

903

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

904

def test_main():
    # Entry point: run this module's doctests first, then each unittest
    # class, one run_unittest() call per class, in the order listed.
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    unittest_classes = (
        TestTokenizerAdheresToPep0263,
        Test_Tokenize,
        TestDetectEncoding,
        TestTokenize,
    )
    for test_class in unittest_classes:
        support.run_unittest(test_class)

Neal Norwitz

c150536

2006-12-28 06:47:50 +0000

[diff] [blame]

911

Thomas Wouters