doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...     "    # NL\\n"
    ...     "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ...     \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...     "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon, which untokenize must preserve for the
roundtrip comparison to succeed.

    >>> roundtrip("if x == 1 : \\n"
    ...     "    print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...     "    # A comment by itself.\\n"
    ...     "    print(x) # Comment here, too.\\n"
    ...     "    # Another comment.\\n"
    ...     "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...     "    == 1):\\n"
    ...     "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...     "  # A comment with weird indent\\n"
    ...     "  after_com = 5\\n"
    ...     "  def x(m): return m*5 # a one liner\\n"
    ...     "  def y(m): # A whitespace after the colon\\n"
    ...     "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...     "except ImportError: # comment\\n"
    ...     "    print('Can not import' # comment2\\n)"
    ...     "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...     "5,6)\\n"
    ...     "y = [3, 4,\\n"
    ...     "5]\\n"
    ...     "z = {'a': 5,\\n"
    ...     "'b':15, 'c':True}\\n"
    ...     "x = len(y) + 5 - a[\\n"
    ...     "3] - a[2]\\n"
    ...     "+ len(z) - z[\\n"
    ...     "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

    >>> dump_tokens("u'abc' + U'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('u"abc" + U"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("ur'abc' + uR'abc' + Ur'abc' + UR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "ur'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "uR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Ur'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "UR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('ur"abc" + uR"abc" + Ur"abc" + UR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'ur"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'uR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Ur"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'UR"abc"'     (1, 30) (1, 37)

    >>> dump_tokens("b'abc' + B'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('b"abc" + B"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...     "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...     "1\\n"
    ...     "# This is a comment\\\\n"
    ...     "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random Python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

tokenize is broken on test_pep3131.py because regular expressions are broken on
the obscure unicode identifiers in it. *sigh*
    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = UR'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "UR'green'"   (2, 7) (2, 16)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob
import token

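# A small helper sketch (an editorial addition, not part of the original
# test suite): tokenize() wants a zero-argument readline callable that
# yields bytes, so wrapping an in-memory string in BytesIO is the usual
# pattern, assuming UTF-8 source.
def _tokenize_string(s):
    # Return the full token list for an in-memory source string.
    return list(tokenize(BytesIO(s.encode('utf-8')).readline))
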
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    try:
        token_list = list(tokenize(f.readline))
    finally:
        f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(keepends=True)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2

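# An editorial sketch (not used by the tests below): roundtrip() compares
# (type, string) pairs because untokenize() only guarantees that limited
# round trip.  Feeding it the full 5-tuples instead asks for a
# whitespace-preserving reconstruction, which holds for well-behaved input.
def roundtrip_exact(source):
    # Tokenize, rebuild from the full token tuples, and compare raw text.
    tokens = list(tokenize(BytesIO(source.encode('utf-8')).readline))
    return untokenize(tokens).decode('utf-8') == source
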
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')
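
# Editorial usage sketch for decistmt() (a hypothetical helper, not part of
# the original test suite): the transformed source is ordinary Python, so
# it can be exec'd directly once Decimal is available in the namespace.
def _decistmt_demo():
    # Prints 0.75: the float literals become exact Decimal values.
    exec(decistmt("from decimal import Decimal; print(0.5 + 0.25)"))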


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'.  The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

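    # Editorial note (a sketch, not an assertion about the tests below):
    # get_readline() builds the same kind of zero-argument, bytes-producing
    # callable that detect_encoding() normally gets from a binary file, e.g.:
    #
    #     with open(path, 'rb') as fp:
    #         encoding, consumed_lines = detect_encoding(fp.readline)
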
    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)


    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)
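
    # Editorial sketch of the contract the mocks above rely on (roughly what
    # tokenize() does, not a claim about its exact implementation): the
    # encoding is detected first, and the lines consumed by detection are
    # replayed in front of the remaining input before _tokenize() runs:
    #
    #     from itertools import chain
    #     encoding, consumed = detect_encoding(readline)
    #     rest = iter(readline, b'')
    #     return _tokenize(chain(consumed, rest).__next__, encoding)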

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()