Blame - Lib/test/test_tokenize.py - platform/external/python/cpython3

2008-03-18 22:41:35 +0000

[diff] [blame]

1

# -*- coding: utf-8 -*-

2

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

3

doctests = """

4

Tests for the tokenize module.

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

5

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

6

The tests can be really simple. Given a small fragment of source

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

7

code, print out a table with tokens. The ENDMARKER is omitted for

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

8

brevity.

9

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

10

>>> dump_tokens("1 + 1")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

11

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

12

NUMBER '1' (1, 0) (1, 1)

13

OP '+' (1, 2) (1, 3)

14

NUMBER '1' (1, 4) (1, 5)

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

15

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

16

>>> dump_tokens("if False:\\n"

17

... " # NL\\n"

18

... " True = False # NEWLINE\\n")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

19

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

20

NAME 'if' (1, 0) (1, 2)

21

NAME 'False' (1, 3) (1, 8)

22

OP ':' (1, 8) (1, 9)

23

NEWLINE '\\n' (1, 9) (1, 10)

24

COMMENT '# NL' (2, 4) (2, 8)

25

NL '\\n' (2, 8) (2, 9)

26

INDENT ' ' (3, 0) (3, 4)

27

NAME 'True' (3, 4) (3, 8)

28

OP '=' (3, 9) (3, 10)

29

NAME 'False' (3, 11) (3, 16)

30

COMMENT '# NEWLINE' (3, 17) (3, 26)

31

NEWLINE '\\n' (3, 26) (3, 27)

32

DEDENT '' (4, 0) (4, 0)

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

33

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

34

>>> indent_error_file = \"""

... def k(x):

... x += 2

... x += 5

... \"""

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

39

>>> readline = BytesIO(indent_error_file.encode('utf-8')).readline

40

>>> for tok in tokenize(readline): pass

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

41

Traceback (most recent call last):

42

...

43

IndentationError: unindent does not match any outer indentation level

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

44

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

45

There are some standard formatting practices that are easy to get right.

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

46

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

47

>>> roundtrip("if x == 1:\\n"

48

... " print(x)\\n")

49

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

50

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

51

>>> roundtrip("# This is a comment\\n# This also")

52

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

53

54

Some people use different formatting conventions, which makes

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

55

untokenize a little trickier. Note that this test involves trailing

56

whitespace after the colon. Note that we use hex escapes to make the

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

57

two trailing blanks apparent in the expected output.

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

58

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

59

>>> roundtrip("if x == 1 : \\n"

60

... " print(x)\\n")

61

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

62

Benjamin Peterson

2008-05-20 21:35:26 +0000

[diff] [blame]

63

>>> f = support.findfile("tokenize_tests.txt")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

64

>>> roundtrip(open(f, 'rb'))

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

65

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

66

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

67

>>> roundtrip("if x == 1:\\n"

68

... " # A comment by itself.\\n"

69

... " print(x) # Comment here, too.\\n"

70

... " # Another comment.\\n"

71

... "after_if = True\\n")

72

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

73

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

74

>>> roundtrip("if (x # The comments need to go in the right place\\n"

75

... " == 1):\\n"

76

... " print('x==1')\\n")

77

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

78

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

79

>>> roundtrip("class Test: # A comment here\\n"

80

... " # A comment with weird indent\\n"

81

... " after_com = 5\\n"

82

... " def x(m): return m*5 # a one liner\\n"

83

... " def y(m): # A whitespace after the colon\\n"

84

... " return y*4 # 3-space indent\\n")

85

True

86

87

Some error-handling code

88

89

>>> roundtrip("try: import somemodule\\n"

90

... "except ImportError: # comment\\n"

Christian Heimes

ba4af49

2008-03-28 00:55:15 +0000

[diff] [blame]

91

... " print('Can not import' # comment2\\n)"

Neal Norwitz

752abd0

2008-05-13 04:55:24 +0000

[diff] [blame]

92

... "else: print('Loaded')\\n")

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

93

True

94

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

95

Balancing continuation

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

96

97

>>> roundtrip("a = (3,4, \\n"

... "5,6)\\n"

... "y = [3, 4,\\n"

... "5]\\n"

... "z = {'a': 5,\\n"

102

... "'b':15, 'c':True}\\n"

103

... "x = len(y) + 5 - a[\\n"

104

... "3] - a[2]\\n"

105

... "+ len(z) - z[\\n"

... "'b']\\n")

True

Ordinary integers and binary operators

110

111

>>> dump_tokens("0xff <= 255")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

112

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

113

NUMBER '0xff' (1, 0) (1, 4)

114

OP '<=' (1, 5) (1, 7)

115

NUMBER '255' (1, 8) (1, 11)

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

116

>>> dump_tokens("0b10 <= 255")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

117

ENCODING 'utf-8' (0, 0) (0, 0)

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

118

NUMBER '0b10' (1, 0) (1, 4)

119

OP '<=' (1, 5) (1, 7)

120

NUMBER '255' (1, 8) (1, 11)

121

>>> dump_tokens("0o123 <= 0O123")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

122

ENCODING 'utf-8' (0, 0) (0, 0)

Eric Smith

2008-03-17 19:49:19 +0000

[diff] [blame]

123

NUMBER '0o123' (1, 0) (1, 5)

124

OP '<=' (1, 6) (1, 8)

125

NUMBER '0O123' (1, 9) (1, 14)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

126

>>> dump_tokens("1234567 > ~0x15")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

127

ENCODING 'utf-8' (0, 0) (0, 0)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

128

NUMBER '1234567' (1, 0) (1, 7)

129

OP '>' (1, 8) (1, 9)

130

OP '~' (1, 10) (1, 11)

131

NUMBER '0x15' (1, 11) (1, 15)

132

>>> dump_tokens("2134568 != 1231515")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

133

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

134

NUMBER '2134568' (1, 0) (1, 7)

135

OP '!=' (1, 8) (1, 10)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

136

NUMBER '1231515' (1, 11) (1, 18)

137

>>> dump_tokens("(-124561-1) & 200000000")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

138

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

139

OP '(' (1, 0) (1, 1)

140

OP '-' (1, 1) (1, 2)

141

NUMBER '124561' (1, 2) (1, 8)

142

OP '-' (1, 8) (1, 9)

143

NUMBER '1' (1, 9) (1, 10)

144

OP ')' (1, 10) (1, 11)

145

OP '&' (1, 12) (1, 13)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

146

NUMBER '200000000' (1, 14) (1, 23)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

147

>>> dump_tokens("0xdeadbeef != -1")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

148

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

149

NUMBER '0xdeadbeef' (1, 0) (1, 10)

150

OP '!=' (1, 11) (1, 13)

151

OP '-' (1, 14) (1, 15)

152

NUMBER '1' (1, 15) (1, 16)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

153

>>> dump_tokens("0xdeadc0de & 12345")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

154

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

155

NUMBER '0xdeadc0de' (1, 0) (1, 10)

156

OP '&' (1, 11) (1, 12)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

157

NUMBER '12345' (1, 13) (1, 18)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

158

>>> dump_tokens("0xFF & 0x15 | 1234")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

159

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

160

NUMBER '0xFF' (1, 0) (1, 4)

161

OP '&' (1, 5) (1, 6)

162

NUMBER '0x15' (1, 7) (1, 11)

163

OP '|' (1, 12) (1, 13)

164

NUMBER '1234' (1, 14) (1, 18)

Long integers

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

168

>>> dump_tokens("x = 0")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

169

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

170

NAME 'x' (1, 0) (1, 1)

171

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

172

NUMBER '0' (1, 4) (1, 5)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

173

>>> dump_tokens("x = 0xfffffffffff")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

174

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

175

NAME 'x' (1, 0) (1, 1)

176

OP '=' (1, 2) (1, 3)

177

NUMBER '0xffffffffff (1, 4) (1, 17)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

178

>>> dump_tokens("x = 123141242151251616110")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

179

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

180

NAME 'x' (1, 0) (1, 1)

181

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

182

NUMBER '123141242151 (1, 4) (1, 25)

183

>>> dump_tokens("x = -15921590215012591")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

184

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

185

NAME 'x' (1, 0) (1, 1)

186

OP '=' (1, 2) (1, 3)

187

OP '-' (1, 4) (1, 5)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

188

NUMBER '159215902150 (1, 5) (1, 22)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

189

190

Floating point numbers

191

192

>>> dump_tokens("x = 3.14159")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

193

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

194

NAME 'x' (1, 0) (1, 1)

195

OP '=' (1, 2) (1, 3)

196

NUMBER '3.14159' (1, 4) (1, 11)

197

>>> dump_tokens("x = 314159.")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

198

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

199

NAME 'x' (1, 0) (1, 1)

200

OP '=' (1, 2) (1, 3)

201

NUMBER '314159.' (1, 4) (1, 11)

202

>>> dump_tokens("x = .314159")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

203

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

204

NAME 'x' (1, 0) (1, 1)

205

OP '=' (1, 2) (1, 3)

206

NUMBER '.314159' (1, 4) (1, 11)

207

>>> dump_tokens("x = 3e14159")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

208

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

209

NAME 'x' (1, 0) (1, 1)

210

OP '=' (1, 2) (1, 3)

211

NUMBER '3e14159' (1, 4) (1, 11)

212

>>> dump_tokens("x = 3E123")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

213

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

214

NAME 'x' (1, 0) (1, 1)

215

OP '=' (1, 2) (1, 3)

216

NUMBER '3E123' (1, 4) (1, 9)

217

>>> dump_tokens("x+y = 3e-1230")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

218

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

219

NAME 'x' (1, 0) (1, 1)

220

OP '+' (1, 1) (1, 2)

221

NAME 'y' (1, 2) (1, 3)

222

OP '=' (1, 4) (1, 5)

223

NUMBER '3e-1230' (1, 6) (1, 13)

224

>>> dump_tokens("x = 3.14e159")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

225

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

226

NAME 'x' (1, 0) (1, 1)

227

OP '=' (1, 2) (1, 3)

228

NUMBER '3.14e159' (1, 4) (1, 12)

String literals

>>> dump_tokens("x = ''; y = \\\"\\\"")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

233

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

234

NAME 'x' (1, 0) (1, 1)

235

OP '=' (1, 2) (1, 3)

236

STRING "''" (1, 4) (1, 6)

237

OP ';' (1, 6) (1, 7)

238

NAME 'y' (1, 8) (1, 9)

239

OP '=' (1, 10) (1, 11)

240

STRING '""' (1, 12) (1, 14)

241

>>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

242

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

243

NAME 'x' (1, 0) (1, 1)

244

OP '=' (1, 2) (1, 3)

245

STRING '\\'"\\'' (1, 4) (1, 7)

246

OP ';' (1, 7) (1, 8)

247

NAME 'y' (1, 9) (1, 10)

248

OP '=' (1, 11) (1, 12)

249

STRING '"\\'"' (1, 13) (1, 16)

250

>>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

251

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

252

NAME 'x' (1, 0) (1, 1)

253

OP '=' (1, 2) (1, 3)

254

STRING '"doesn\\'t "' (1, 4) (1, 14)

255

NAME 'shrink' (1, 14) (1, 20)

256

STRING '", does it"' (1, 20) (1, 31)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

257

>>> dump_tokens("x = 'abc' + 'ABC'")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

258

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

259

NAME 'x' (1, 0) (1, 1)

260

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

261

STRING "'abc'" (1, 4) (1, 9)

262

OP '+' (1, 10) (1, 11)

263

STRING "'ABC'" (1, 12) (1, 17)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

264

>>> dump_tokens('y = "ABC" + "ABC"')

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

265

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

266

NAME 'y' (1, 0) (1, 1)

267

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

268

STRING '"ABC"' (1, 4) (1, 9)

269

OP '+' (1, 10) (1, 11)

270

STRING '"ABC"' (1, 12) (1, 17)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

271

>>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

272

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

273

NAME 'x' (1, 0) (1, 1)

274

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

275

STRING "r'abc'" (1, 4) (1, 10)

276

OP '+' (1, 11) (1, 12)

277

STRING "r'ABC'" (1, 13) (1, 19)

278

OP '+' (1, 20) (1, 21)

279

STRING "R'ABC'" (1, 22) (1, 28)

280

OP '+' (1, 29) (1, 30)

281

STRING "R'ABC'" (1, 31) (1, 37)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

282

>>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

283

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

284

NAME 'y' (1, 0) (1, 1)

285

OP '=' (1, 2) (1, 3)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

286

STRING 'r"abc"' (1, 4) (1, 10)

287

OP '+' (1, 11) (1, 12)

288

STRING 'r"ABC"' (1, 13) (1, 19)

289

OP '+' (1, 20) (1, 21)

290

STRING 'R"ABC"' (1, 22) (1, 28)

291

OP '+' (1, 29) (1, 30)

292

STRING 'R"ABC"' (1, 31) (1, 37)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

Operators

>>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

297

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

298

NAME 'def' (1, 0) (1, 3)

299

NAME 'd22' (1, 4) (1, 7)

300

OP '(' (1, 7) (1, 8)

301

NAME 'a' (1, 8) (1, 9)

302

OP ',' (1, 9) (1, 10)

303

NAME 'b' (1, 11) (1, 12)

304

OP ',' (1, 12) (1, 13)

305

NAME 'c' (1, 14) (1, 15)

306

OP '=' (1, 15) (1, 16)

307

NUMBER '2' (1, 16) (1, 17)

308

OP ',' (1, 17) (1, 18)

309

NAME 'd' (1, 19) (1, 20)

310

OP '=' (1, 20) (1, 21)

311

NUMBER '2' (1, 21) (1, 22)

312

OP ',' (1, 22) (1, 23)

313

OP '*' (1, 24) (1, 25)

314

NAME 'k' (1, 25) (1, 26)

315

OP ')' (1, 26) (1, 27)

316

OP ':' (1, 27) (1, 28)

317

NAME 'pass' (1, 29) (1, 33)

318

>>> dump_tokens("def d01v_(a=1, *k, **w): pass")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

319

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

320

NAME 'def' (1, 0) (1, 3)

321

NAME 'd01v_' (1, 4) (1, 9)

322

OP '(' (1, 9) (1, 10)

323

NAME 'a' (1, 10) (1, 11)

324

OP '=' (1, 11) (1, 12)

325

NUMBER '1' (1, 12) (1, 13)

326

OP ',' (1, 13) (1, 14)

327

OP '*' (1, 15) (1, 16)

328

NAME 'k' (1, 16) (1, 17)

329

OP ',' (1, 17) (1, 18)

330

OP '**' (1, 19) (1, 21)

331

NAME 'w' (1, 21) (1, 22)

332

OP ')' (1, 22) (1, 23)

333

OP ':' (1, 23) (1, 24)

334

NAME 'pass' (1, 25) (1, 29)

Comparison

>>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +

339

... "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

340

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

341

NAME 'if' (1, 0) (1, 2)

342

NUMBER '1' (1, 3) (1, 4)

343

OP '<' (1, 5) (1, 6)

344

NUMBER '1' (1, 7) (1, 8)

345

OP '>' (1, 9) (1, 10)

346

NUMBER '1' (1, 11) (1, 12)

347

OP '==' (1, 13) (1, 15)

348

NUMBER '1' (1, 16) (1, 17)

349

OP '>=' (1, 18) (1, 20)

350

NUMBER '5' (1, 21) (1, 22)

351

OP '<=' (1, 23) (1, 25)

352

NUMBER '0x15' (1, 26) (1, 30)

353

OP '<=' (1, 31) (1, 33)

354

NUMBER '0x12' (1, 34) (1, 38)

355

OP '!=' (1, 39) (1, 41)

356

NUMBER '1' (1, 42) (1, 43)

357

NAME 'and' (1, 44) (1, 47)

358

NUMBER '5' (1, 48) (1, 49)

359

NAME 'in' (1, 50) (1, 52)

360

NUMBER '1' (1, 53) (1, 54)

361

NAME 'not' (1, 55) (1, 58)

362

NAME 'in' (1, 59) (1, 61)

363

NUMBER '1' (1, 62) (1, 63)

364

NAME 'is' (1, 64) (1, 66)

365

NUMBER '1' (1, 67) (1, 68)

366

NAME 'or' (1, 69) (1, 71)

367

NUMBER '5' (1, 72) (1, 73)

368

NAME 'is' (1, 74) (1, 76)

369

NAME 'not' (1, 77) (1, 80)

370

NUMBER '1' (1, 81) (1, 82)

371

OP ':' (1, 82) (1, 83)

372

NAME 'pass' (1, 84) (1, 88)

Shift

>>> dump_tokens("x = 1 << 1 >> 5")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

377

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

378

NAME 'x' (1, 0) (1, 1)

379

OP '=' (1, 2) (1, 3)

380

NUMBER '1' (1, 4) (1, 5)

381

OP '<<' (1, 6) (1, 8)

382

NUMBER '1' (1, 9) (1, 10)

383

OP '>>' (1, 11) (1, 13)

384

NUMBER '5' (1, 14) (1, 15)

Additive

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

388

>>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

389

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

390

NAME 'x' (1, 0) (1, 1)

391

OP '=' (1, 2) (1, 3)

392

NUMBER '1' (1, 4) (1, 5)

393

OP '-' (1, 6) (1, 7)

394

NAME 'y' (1, 8) (1, 9)

395

OP '+' (1, 10) (1, 11)

396

NUMBER '15' (1, 12) (1, 14)

397

OP '-' (1, 15) (1, 16)

Mark Dickinson

2008-03-16 05:05:12 +0000

[diff] [blame]

398

NUMBER '1' (1, 17) (1, 18)

399

OP '+' (1, 19) (1, 20)

400

NUMBER '0x124' (1, 21) (1, 26)

401

OP '+' (1, 27) (1, 28)

402

NAME 'z' (1, 29) (1, 30)

403

OP '+' (1, 31) (1, 32)

404

NAME 'a' (1, 33) (1, 34)

405

OP '[' (1, 34) (1, 35)

406

NUMBER '5' (1, 35) (1, 36)

407

OP ']' (1, 36) (1, 37)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

Multiplicative

>>> dump_tokens("x = 1//1*1/5*12%0x12")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

412

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

413

NAME 'x' (1, 0) (1, 1)

414

OP '=' (1, 2) (1, 3)

415

NUMBER '1' (1, 4) (1, 5)

416

OP '//' (1, 5) (1, 7)

417

NUMBER '1' (1, 7) (1, 8)

418

OP '*' (1, 8) (1, 9)

419

NUMBER '1' (1, 9) (1, 10)

420

OP '/' (1, 10) (1, 11)

421

NUMBER '5' (1, 11) (1, 12)

422

OP '*' (1, 12) (1, 13)

423

NUMBER '12' (1, 13) (1, 15)

424

OP '%' (1, 15) (1, 16)

425

NUMBER '0x12' (1, 16) (1, 20)

Unary

>>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

430

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

431

OP '~' (1, 0) (1, 1)

432

NUMBER '1' (1, 1) (1, 2)

433

OP '^' (1, 3) (1, 4)

434

NUMBER '1' (1, 5) (1, 6)

435

OP '&' (1, 7) (1, 8)

436

NUMBER '1' (1, 9) (1, 10)

437

OP '|' (1, 11) (1, 12)

438

NUMBER '1' (1, 12) (1, 13)

439

OP '^' (1, 14) (1, 15)

440

OP '-' (1, 16) (1, 17)

441

NUMBER '1' (1, 17) (1, 18)

442

>>> dump_tokens("-1*1/1+1*1//1 - ---1**1")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

443

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

444

OP '-' (1, 0) (1, 1)

445

NUMBER '1' (1, 1) (1, 2)

446

OP '*' (1, 2) (1, 3)

447

NUMBER '1' (1, 3) (1, 4)

448

OP '/' (1, 4) (1, 5)

449

NUMBER '1' (1, 5) (1, 6)

450

OP '+' (1, 6) (1, 7)

451

NUMBER '1' (1, 7) (1, 8)

452

OP '*' (1, 8) (1, 9)

453

NUMBER '1' (1, 9) (1, 10)

454

OP '//' (1, 10) (1, 12)

455

NUMBER '1' (1, 12) (1, 13)

456

OP '-' (1, 14) (1, 15)

457

OP '-' (1, 16) (1, 17)

458

OP '-' (1, 17) (1, 18)

459

OP '-' (1, 18) (1, 19)

460

NUMBER '1' (1, 19) (1, 20)

461

OP '**' (1, 20) (1, 22)

462

NUMBER '1' (1, 22) (1, 23)

Selector

>>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

467

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

468

NAME 'import' (1, 0) (1, 6)

469

NAME 'sys' (1, 7) (1, 10)

470

OP ',' (1, 10) (1, 11)

471

NAME 'time' (1, 12) (1, 16)

472

NEWLINE '\\n' (1, 16) (1, 17)

473

NAME 'x' (2, 0) (2, 1)

474

OP '=' (2, 2) (2, 3)

475

NAME 'sys' (2, 4) (2, 7)

476

OP '.' (2, 7) (2, 8)

477

NAME 'modules' (2, 8) (2, 15)

478

OP '[' (2, 15) (2, 16)

479

STRING "'time'" (2, 16) (2, 22)

480

OP ']' (2, 22) (2, 23)

481

OP '.' (2, 23) (2, 24)

482

NAME 'time' (2, 24) (2, 28)

483

OP '(' (2, 28) (2, 29)

484

OP ')' (2, 29) (2, 30)

Methods

>>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

489

ENCODING 'utf-8' (0, 0) (0, 0)

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

490

OP '@' (1, 0) (1, 1)

491

NAME 'staticmethod (1, 1) (1, 13)

492

NEWLINE '\\n' (1, 13) (1, 14)

493

NAME 'def' (2, 0) (2, 3)

494

NAME 'foo' (2, 4) (2, 7)

495

OP '(' (2, 7) (2, 8)

496

NAME 'x' (2, 8) (2, 9)

497

OP ',' (2, 9) (2, 10)

498

NAME 'y' (2, 10) (2, 11)

499

OP ')' (2, 11) (2, 12)

500

OP ':' (2, 12) (2, 13)

501

NAME 'pass' (2, 14) (2, 18)

502

503

Backslash means line continuation, except for comments

504

505

>>> roundtrip("x=1+\\\\n"

506

... "1\\n"

507

... "# This is a comment\\\\n"

508

... "# This also\\n")

509

True

510

>>> roundtrip("# Comment \\\\nx = 0")

511

True

Christian Heimes

ba4af49

2008-03-28 00:55:15 +0000

[diff] [blame]

512

513

Two string literals on the same line

514

515

>>> roundtrip("'' ''")

516

True

517

518

Test roundtrip on random python modules.

519

Pass the '-ucompiler' option to process the full directory.

520

521

>>> import random

522

>>> tempdir = os.path.dirname(f) or os.curdir

523

>>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

524

Benjamin Peterson

2008-05-20 21:35:26 +0000

[diff] [blame]

525

>>> if not support.is_resource_enabled("compiler"):

Christian Heimes

ba4af49

2008-03-28 00:55:15 +0000

[diff] [blame]

526

... testfiles = random.sample(testfiles, 10)

527

...

528

>>> for testfile in testfiles:

529

... if not roundtrip(open(testfile, 'rb')):

530

... print("Roundtrip failed for file %s" % testfile)

531

... break

532

... else: True

533

True

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

534

"""

535

Benjamin Peterson

2008-05-20 21:35:26 +0000

[diff] [blame]

536

from test import support

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

537

from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,

538

STRING, ENDMARKER, tok_name, detect_encoding)

539

from io import BytesIO

540

from unittest import TestCase

541

import os, sys, glob

Raymond Hettinger

2005-06-10 11:05:19 +0000

[diff] [blame]

542

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

543

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The string s is encoded as UTF-8 and tokenized.  One row is printed per
    token: the token type name, the repr() of the token text, and the start
    and end positions.  The type name is padded/truncated to 10 characters
    and the token repr to 13 characters, so long tokens show up cut off.

    The ENDMARKER is omitted.
    """
    source = BytesIO(s.encode('utf-8'))
    for tok_type, token, start, end, _line in tokenize(source.readline):
        if tok_type == ENDMARKER:
            break
        # Format from explicit values instead of ``% locals()`` so the row
        # does not depend on local variable names, and so the builtin
        # ``type`` is not rebound inside the loop.
        print("%-10.10s %-13.13r %s %s"
              % (tok_name[tok_type], token, start, end))

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

554

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

555

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.

    Returns True when both tokenizations yield the same (type, string)
    pairs and False otherwise.  A string argument is encoded as UTF-8
    first; a file argument must be opened in binary mode.  `f` is always
    closed before returning, including when tokenizing it raises.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    try:
        token_list = list(tokenize(f.readline))
    finally:
        # Close on failure too: callers hand over freshly opened files and
        # some of them expect tokenize() to raise (e.g. SyntaxError for a
        # coding cookie that contradicts the BOM).
        f.close()
    # Only (type, string) pairs are compared; positions are dropped, so
    # untokenize() receives 2-tuples.
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    # Feed the regenerated source back one line at a time, line endings kept.
    readline = iter(new_bytes.splitlines(True)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

571

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

572

# This is an example from the docs, set up as a doctest.

Raymond Hettinger

2005-06-10 11:05:19 +0000

[diff] [blame]

573

def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    Every NUMBER token whose text contains a '.' is replaced by a call
    ``Decimal('<original text>')``; all other tokens are passed through
    unchanged and the result is turned back into source text.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    rewritten = []
    source = BytesIO(s.encode('utf-8'))
    for token in tokenize(source.readline):
        kind, text = token[0], token[1]
        is_float_literal = kind == NUMBER and '.' in text
        if not is_float_literal:
            # Anything that is not a float literal is copied as-is.
            rewritten.append((kind, text))
            continue
        # Wrap the literal's source text in a Decimal(...) call.
        rewritten += [
            (NAME, 'Decimal'),
            (OP, '('),
            (STRING, repr(text)),
            (OP, ')'),
        ]
    return untokenize(rewritten).decode('utf-8')

608

609

610

class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        # Round-trip the named data file that lives next to this module and
        # return roundtrip()'s result.  The file object is handed over to
        # roundtrip(), which closes it once it has been tokenized.
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        # Must use the data file *without* a BOM; the BOM variant is covered
        # by test_utf8_coding_cookie_and_utf8_bom below.
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'.  The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

641

642

643

class Test_Tokenize(TestCase):
    """Check how _tokenize() treats its input with and without an encoding."""

    @staticmethod
    def _one_line_reader(first_line):
        # Build a readline callable that hands out first_line exactly once
        # and b'' on every later call.
        pending = [first_line]

        def readline():
            return pending.pop() if pending else b''

        return readline

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        readline = self._one_line_reader(line)

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(STRING, literal, (1, 0), (1, 7), literal)]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        # The line is handed over as str on purpose: with encoding=None
        # nothing is expected to decode it.
        readline = self._one_line_reader(literal)

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(STRING, literal, (1, 0), (1, 7), literal)]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")

679

680

681

class TestDetectEncoding(TestCase):

682

683

def get_readline(self, lines):
    # Return a readline-style callable that serves the items of `lines` in
    # order, one per call, and raises StopIteration once they are used up.
    cursor = 0

    def readline():
        nonlocal cursor
        if cursor != len(lines):
            cursor += 1
            return lines[cursor - 1]
        raise StopIteration

    return readline

def test_no_bom_no_encoding_cookie(self):

695

lines = (

696

b'# something\n',

697

b'print(something)\n',

698

b'do_something(else)\n'

699

)

700

encoding, consumed_lines = detect_encoding(self.get_readline(lines))

701

self.assertEquals(encoding, 'utf-8')

702

self.assertEquals(consumed_lines, list(lines[:2]))

703

704

def test_bom_no_cookie(self):

705

lines = (

706

b'\xef\xbb\xbf# something\n',

707

b'print(something)\n',

708

b'do_something(else)\n'

709

)

710

encoding, consumed_lines = detect_encoding(self.get_readline(lines))

711

self.assertEquals(encoding, 'utf-8')

712

self.assertEquals(consumed_lines,

713

[b'# something\n', b'print(something)\n'])

714

715

def test_cookie_first_line_no_bom(self):

716

lines = (

717

b'# -*- coding: latin-1 -*-\n',

718

b'print(something)\n',

719

b'do_something(else)\n'

720

)

721

encoding, consumed_lines = detect_encoding(self.get_readline(lines))

Benjamin Peterson

0c7f9c9

2009-10-09 21:53:27 +0000

[diff] [blame]

722

self.assertEquals(encoding, 'iso-8859-1')

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

723

self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

724

725

def test_matched_bom_and_cookie_first_line(self):

726

lines = (

727

b'\xef\xbb\xbf# coding=utf-8\n',

728

b'print(something)\n',

729

b'do_something(else)\n'

730

)

731

encoding, consumed_lines = detect_encoding(self.get_readline(lines))

732

self.assertEquals(encoding, 'utf-8')

733

self.assertEquals(consumed_lines, [b'# coding=utf-8\n'])

734

735

def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):

736

lines = (

737

b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',

738

b'print(something)\n',

739

b'do_something(else)\n'

740

)

741

readline = self.get_readline(lines)

742

self.assertRaises(SyntaxError, detect_encoding, readline)

743

744

def test_cookie_second_line_no_bom(self):

745

lines = (

746

b'#! something\n',

747

b'# vim: set fileencoding=ascii :\n',

748

b'print(something)\n',

749

b'do_something(else)\n'

750

)

751

encoding, consumed_lines = detect_encoding(self.get_readline(lines))

752

self.assertEquals(encoding, 'ascii')

753

expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']

754

self.assertEquals(consumed_lines, expected)

755

756

def test_matched_bom_and_cookie_second_line(self):

757

lines = (

758

b'\xef\xbb\xbf#! something\n',

759

b'f# coding=utf-8\n',

760

b'print(something)\n',

761

b'do_something(else)\n'

762

)

763

encoding, consumed_lines = detect_encoding(self.get_readline(lines))

764

self.assertEquals(encoding, 'utf-8')

765

self.assertEquals(consumed_lines,

766

[b'#! something\n', b'f# coding=utf-8\n'])

767

768

def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):

769

lines = (

770

b'\xef\xbb\xbf#! something\n',

771

b'# vim: set fileencoding=ascii :\n',

772

b'print(something)\n',

773

b'do_something(else)\n'

774

)

775

readline = self.get_readline(lines)

776

self.assertRaises(SyntaxError, detect_encoding, readline)

777

Benjamin Peterson

0c7f9c9

2009-10-09 21:53:27 +0000

[diff] [blame]

778

def test_latin1_normalization(self):

779

# See get_normal_name() in tokenizer.c.

780

encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",

781

"iso-8859-1-unix", "iso-latin-1-mac")

782

for encoding in encodings:

783

for rep in ("-", "_"):

784

enc = encoding.replace("-", rep)

785

lines = (b"#!/usr/bin/python\n",

786

b"# coding: " + enc.encode("ascii") + b"\n",

787

b"print(things)\n",

788

b"do_something += 4\n")

789

rl = self.get_readline(lines)

790

found, consumed_lines = detect_encoding(rl)

791

self.assertEquals(found, "iso-8859-1")

792

793

def test_utf8_normalization(self):

794

# See get_normal_name() in tokenizer.c.

795

encodings = ("utf-8", "utf-8-mac", "utf-8-unix")

796

for encoding in encodings:

797

for rep in ("-", "_"):

798

enc = encoding.replace("-", rep)

799

lines = (b"#!/usr/bin/python\n",

800

b"# coding: " + enc.encode("ascii") + b"\n",

801

b"1 + 3\n")

802

rl = self.get_readline(lines)

803

found, consumed_lines = detect_encoding(rl)

804

self.assertEquals(found, "utf-8")

805

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

806

def test_short_files(self):

807

readline = self.get_readline((b'print(something)\n',))

808

encoding, consumed_lines = detect_encoding(readline)

809

self.assertEquals(encoding, 'utf-8')

810

self.assertEquals(consumed_lines, [b'print(something)\n'])

811

812

encoding, consumed_lines = detect_encoding(self.get_readline(()))

813

self.assertEquals(encoding, 'utf-8')

814

self.assertEquals(consumed_lines, [])

815

816

readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))

817

encoding, consumed_lines = detect_encoding(readline)

818

self.assertEquals(encoding, 'utf-8')

819

self.assertEquals(consumed_lines, [b'print(something)\n'])

820

821

readline = self.get_readline((b'\xef\xbb\xbf',))

822

encoding, consumed_lines = detect_encoding(readline)

823

self.assertEquals(encoding, 'utf-8')

824

self.assertEquals(consumed_lines, [])

825

Benjamin Peterson

433f32c

2008-12-12 01:25:05 +0000

[diff] [blame]

826

readline = self.get_readline((b'# coding: bad\n',))

827

self.assertRaises(SyntaxError, detect_encoding, readline)

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

828

829

class TestTokenize(TestCase):
    """Check how ``tokenize`` combines ``detect_encoding`` and ``_tokenize``."""

    def test_tokenize(self):
        """``tokenize`` replays consumed lines and forwards the encoding.

        ``detect_encoding`` and ``_tokenize`` are temporarily replaced on the
        ``tokenize`` module so the test can observe what ``tokenize`` passes
        between them.
        """
        import tokenize as tokenize_module
        # Unique sentinel: it can only reach mock__tokenize by being forwarded.
        encoding = object()
        encoding_used = None

        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            # Drain readline until it returns a false value.
            while True:
                next_line = readline()
                if not next_line:
                    return out
                out.append(next_line)

        counter = 0

        def mock_readline():
            # Returns 1, 2, 3, 4 and then b'' on the fifth call.
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            # Always restore the real functions, even if the assertion fails.
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        # assertTrue(a, b) would treat b as the failure message and compare
        # nothing; assertEqual actually checks the encoding was forwarded.
        self.assertEqual(encoding_used, encoding)

Raymond Hettinger

2005-06-10 11:05:19 +0000

[diff] [blame]

869

Christian Heimes

2008-03-16 00:07:10 +0000

[diff] [blame]

870

871

# Extra doctest sources, keyed by name, exposed through the module's
# ``__test__`` mapping.
__test__ = dict(doctests=doctests, decistmt=decistmt)

872

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame]

873

def test_main():
    """Run this module's doctests, then each unittest class in turn."""
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    # Each class gets its own run_unittest call, in this order.
    for case in (TestTokenizerAdheresToPep0263, Test_Tokenize,
                 TestDetectEncoding, TestTokenize):
        support.run_unittest(case)

Neal Norwitz

c150536

2006-12-28 06:47:50 +0000

[diff] [blame]

880

Thomas Wouters