blob: eb96471f2df460e94adf8c8f010afbe2bd884cbc [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Walter Dörwald9ae019b2006-03-18 14:22:26 +00005import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The data is kept in a single string buffer; write() appends to it and
    read() removes data from its front.
    """
    def __init__(self):
        # Everything written but not yet read.
        self._buffer = ""

    def write(self, chars):
        # Append chars to the end of the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # Return (and remove) up to size items from the front of the buffer.
        # A negative size drains the whole buffer.
        if size < 0:
            s = self._buffer
            self._buffer = ""
            return s
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
26
class ReadTest(unittest.TestCase):
    # Shared reader tests.  Every method reads self.encoding, which this
    # class does not define itself; the subclasses in this file set it.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        # Note: zip() stops at the shorter of the two sequences, so surplus
        # bytes or surplus expected entries are not compared.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of input.
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Call readline() until it returns an empty result and join the
            # collected lines with "|" so the line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        # NOTE(review): split() without arguments splits on whitespace, and
        # the string below appears to consist of whitespace only, so the
        # result looks like an empty list and this loop body never runs.
        # The same applies to the inner loop further down.  Confirm, and
        # consider an explicit tuple of line ends; the expected values would
        # then need revisiting as well.
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were encoded, one list entry per line.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so the reader only ever sees
        # what has been written so far.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # readline() must return the three lines unchanged, then u"".
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
247
class UTF32Test(ReadTest):
    # Runs the shared ReadTest checks for "utf-32" and adds BOM handling,
    # error handler and surrogate pair tests.
    encoding = "utf-32"

    # u"spamspam" with one leading BOM, little endian and big endian.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input that starts with neither BOM must make read() fail.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final unit: 'replace' yields U+FFFD, 'ignore' yields
        # nothing; both report one byte consumed.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
332
class UTF32LETest(ReadTest):
    # Runs the shared ReadTest checks for "utf-32-le" (no BOM expected in
    # the partial results below).
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # Least significant byte comes first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
376
class UTF32BETest(ReadTest):
    # Runs the shared ReadTest checks for "utf-32-be" (no BOM expected in
    # the partial results below).
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # Most significant byte comes first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
420
421
class UTF16Test(ReadTest):
    # Runs the shared ReadTest checks for "utf-16" and adds BOM handling,
    # error handler and codecs.open() tests.
    encoding = "utf-16"

    # u"spamspam" with one leading BOM, little endian and big endian.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input that starts with neither BOM must make read() fail.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final unit: 'replace' yields U+FFFD, 'ignore' yields
        # nothing; both report one byte consumed.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        # Register the cleanup before the file is created so it is removed
        # even if one of the steps below fails.
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000494
class UTF16LETest(ReadTest):
    # Runs the shared ReadTest checks for "utf-16-le" and checks decoding
    # of malformed input.
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Pairs of (malformed input, expected result with 'replace').
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            # Strict decoding must fail; 'replace' must give expected.
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000531
class UTF16BETest(ReadTest):
    # Runs the shared ReadTest checks for "utf-16-be" and checks decoding
    # of malformed input.
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Pairs of (malformed input, expected result with 'replace').
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            # Strict decoding must fail; 'replace' must give expected.
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000568
class UTF8Test(ReadTest):
    # Runs the shared ReadTest checks for "utf-8".
    encoding = "utf-8"

    def test_partial(self):
        # One expected entry per encoded byte; a character only shows up
        # once its last byte has been fed.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )
593
class UTF7Test(ReadTest):
    # Runs the shared ReadTest checks for "utf-7".
    encoding = "utf-7"

    def test_partial(self):
        # One expected entry per encoded byte of u"a+-b".
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000608
class UTF16ExTest(unittest.TestCase):
    # Argument and error checks for codecs.utf_16_ex_decode().

    def test_errors(self):
        # A single byte with final=True cannot be decoded in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is rejected.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
616
class ReadBufferTest(unittest.TestCase):
    # Checks for codecs.readbuffer_encode().

    def test_array(self):
        # An array of characters comes back as the string and its length.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing argument and an int argument are both rejected.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
632
class CharBufferTest(unittest.TestCase):
    # Checks for codecs.charbuffer_encode().

    def test_string(self):
        # A string comes back unchanged together with its length.
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing argument and an int argument are both rejected.
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
644
class UTF8SigTest(ReadTest):
    # Runs the shared ReadTest checks for "utf-8-sig" and adds tests for
    # input with and without a leading BOM.
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # The incremental decoder must give back the text without the BOM
        # that encoding with "utf-8-sig" put in front of it.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Input starts with a BOM; the decoded text must not contain it,
        # whatever size is passed to read().
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                # None means "read everything in one call".
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Same as test_stream_bom, but the input has no BOM.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                # None means "read everything in one call".
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
728
class EscapeDecodeTest(unittest.TestCase):
    # Checks for codecs.escape_decode(); every result is a pair of the
    # decoded string and the number of input bytes consumed.

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        # Any byte other than a backslash passes through unchanged.
        for b in ''.join(map(chr, range(256))):
            if b != '\\':
                self.assertEqual(codecs.escape_decode(b + '0'),
                                 (b + '0', 2))

    def test_escape(self):
        self.assertEqual(codecs.escape_decode(b"[\\\n]"), (b"[]", 4))
        self.assertEqual(codecs.escape_decode(br'[\"]'), (b'["]', 4))
        self.assertEqual(codecs.escape_decode(br"[\']"), (b"[']", 4))
        self.assertEqual(codecs.escape_decode(br"[\\]"), (br"[\]", 4))
        self.assertEqual(codecs.escape_decode(br"[\a]"), (b"[\x07]", 4))
        self.assertEqual(codecs.escape_decode(br"[\b]"), (b"[\x08]", 4))
        self.assertEqual(codecs.escape_decode(br"[\t]"), (b"[\x09]", 4))
        self.assertEqual(codecs.escape_decode(br"[\n]"), (b"[\x0a]", 4))
        self.assertEqual(codecs.escape_decode(br"[\v]"), (b"[\x0b]", 4))
        self.assertEqual(codecs.escape_decode(br"[\f]"), (b"[\x0c]", 4))
        self.assertEqual(codecs.escape_decode(br"[\r]"), (b"[\x0d]", 4))
        self.assertEqual(codecs.escape_decode(br"[\7]"), (b"[\x07]", 4))
        self.assertEqual(codecs.escape_decode(br"[\8]"), (br"[\8]", 4))
        self.assertEqual(codecs.escape_decode(br"[\78]"), (b"[\x078]", 5))
        self.assertEqual(codecs.escape_decode(br"[\41]"), (b"[!]", 5))
        self.assertEqual(codecs.escape_decode(br"[\418]"), (b"[!8]", 6))
        self.assertEqual(codecs.escape_decode(br"[\101]"), (b"[A]", 6))
        self.assertEqual(codecs.escape_decode(br"[\1010]"), (b"[A0]", 7))
        self.assertEqual(codecs.escape_decode(br"[\501]"), (b"[A]", 6))
        self.assertEqual(codecs.escape_decode(br"[\x41]"), (b"[A]", 6))
        self.assertEqual(codecs.escape_decode(br"[\X41]"), (br"[\X41]", 6))
        self.assertEqual(codecs.escape_decode(br"[\x410]"), (b"[A0]", 7))
        # A backslash followed by a character that does not start a known
        # escape is kept as it is.
        for b in ''.join(map(chr, range(256))):
            if b not in '\n"\'\\abtnvfr01234567x':
                self.assertEqual(codecs.escape_decode('\\' + b),
                                 ('\\' + b, 2))

    def test_errors(self):
        # Incomplete \x escapes: an error by default, dropped with "ignore",
        # turned into "?" with "replace".
        self.assertRaises(ValueError, codecs.escape_decode, br"\x")
        self.assertRaises(ValueError, codecs.escape_decode, br"[\x]")
        self.assertEqual(codecs.escape_decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(codecs.escape_decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, codecs.escape_decode, br"\x0")
        self.assertRaises(ValueError, codecs.escape_decode, br"[\x0]")
        self.assertEqual(codecs.escape_decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(codecs.escape_decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
776
class RecodingTest(unittest.TestCase):
    # Smoke test for codecs.EncodedFile(); it has no assertions and only
    # has to run without crashing.

    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000785
# From RFC 3492
# Each entry is a (unicode text, punycode byte string) pair.  The punycode
# side is compared case-insensitively when encoding (see PunycodeTest).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
889
# Sanity check on the table above: every entry must be a 2-tuple.  A
# malformed entry (for instance one created by a missing comma) is printed
# when the module is imported so that it can be spotted.  The parenthesised
# print produces the same output as the print statement on Python 2.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
893
class PunycodeTest(unittest.TestCase):
    """Run every punycode_testcases pair through the punycode codec."""

    def test_encode(self):
        # Both sides are lower-cased before the comparison: some of the
        # reference encodings use upper case while our encoder produces
        # lower case only, and lower-casing just the reference would not be
        # enough because some of the input characters are upper case too.
        for text, reference in punycode_testcases:
            produced = text.encode("punycode")
            self.assertEqual(produced.lower(), reference.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            decoded = encoded.decode("punycode")
            self.assertEqual(text, decoded)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000907
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000908class UnicodeInternalTest(unittest.TestCase):
909 def test_bug1251300(self):
910 # Decoding with unicode_internal used to not correctly handle "code
911 # points" above 0x10ffff on UCS-4 builds.
912 if sys.maxunicode > 0xffff:
913 ok = [
914 ("\x00\x10\xff\xff", u"\U0010ffff"),
915 ("\x00\x00\x01\x01", u"\U00000101"),
916 ("", u""),
917 ]
918 not_ok = [
919 "\x7f\xff\xff\xff",
920 "\x80\x00\x00\x00",
921 "\x81\x00\x00\x00",
922 "\x00",
923 "\x00\x00\x00\x00\x00",
924 ]
925 for internal, uni in ok:
926 if sys.byteorder == "little":
927 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +0000928 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000929 for internal in not_ok:
930 if sys.byteorder == "little":
931 internal = "".join(reversed(internal))
932 self.assertRaises(UnicodeDecodeError, internal.decode,
933 "unicode_internal")
934
935 def test_decode_error_attributes(self):
936 if sys.maxunicode > 0xffff:
937 try:
938 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
939 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +0000940 self.assertEqual("unicode_internal", ex.encoding)
941 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
942 self.assertEqual(4, ex.start)
943 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000944 else:
945 self.fail()
946
947 def test_decode_callback(self):
948 if sys.maxunicode > 0xffff:
949 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
950 decoder = codecs.getdecoder("unicode_internal")
951 ab = u"ab".encode("unicode_internal")
952 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
953 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +0000954 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000955
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000956 def test_encode_length(self):
957 # Issue 3739
958 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +0000959 self.assertEqual(encoder(u"a")[1], 1)
960 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000961
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000962 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +0000963 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000964
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
#  - expected is None when nameprep must reject the input.
#  - (None, None) is a placeholder for a skipped vector; it keeps the
#    list position in step with the "3.N" numbering used in failure
#    messages.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1117
1118
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the vector number ("3.N") so the failing
                    # entry can be found in the table.
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1137
Walter Dörwald78a0be62006-04-14 18:25:39 +00001138class IDNACodecTest(unittest.TestCase):
1139 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001140 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1141 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1142 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1143 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001144
1145 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001146 self.assertEqual(u"python.org".encode("idna"), "python.org")
1147 self.assertEqual("python.org.".encode("idna"), "python.org.")
1148 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1149 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001150
Martin v. Löwis8b595142005-08-25 11:03:38 +00001151 def test_stream(self):
1152 import StringIO
1153 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1154 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001155 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001156
Walter Dörwald78a0be62006-04-14 18:25:39 +00001157 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001158 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001159 "".join(codecs.iterdecode("python.org", "idna")),
1160 u"python.org"
1161 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001162 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001163 "".join(codecs.iterdecode("python.org.", "idna")),
1164 u"python.org."
1165 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001166 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001167 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1168 u"pyth\xf6n.org."
1169 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001170 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001171 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1172 u"pyth\xf6n.org."
1173 )
1174
1175 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001176 self.assertEqual(decoder.decode("xn--xam", ), u"")
1177 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1178 self.assertEqual(decoder.decode(u"rg"), u"")
1179 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001180
1181 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001182 self.assertEqual(decoder.decode("xn--xam", ), u"")
1183 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1184 self.assertEqual(decoder.decode("rg."), u"org.")
1185 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001186
1187 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001188 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001189 "".join(codecs.iterencode(u"python.org", "idna")),
1190 "python.org"
1191 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001192 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001193 "".join(codecs.iterencode(u"python.org.", "idna")),
1194 "python.org."
1195 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001196 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001197 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1198 "xn--pythn-mua.org."
1199 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001200 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001201 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1202 "xn--pythn-mua.org."
1203 )
1204
1205 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001206 self.assertEqual(encoder.encode(u"\xe4x"), "")
1207 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1208 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001209
1210 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001211 self.assertEqual(encoder.encode(u"\xe4x"), "")
1212 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1213 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001214
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001215class CodecsModuleTest(unittest.TestCase):
1216
1217 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001218 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001219 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001220 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001221 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001222 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1223
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001224 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001225 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001226 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001227 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001228 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001229 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001230 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1231
1232 def test_register(self):
1233 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001234 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001235
1236 def test_lookup(self):
1237 self.assertRaises(TypeError, codecs.lookup)
1238 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001239 self.assertRaises(LookupError, codecs.lookup, " ")
1240
1241 def test_getencoder(self):
1242 self.assertRaises(TypeError, codecs.getencoder)
1243 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1244
1245 def test_getdecoder(self):
1246 self.assertRaises(TypeError, codecs.getdecoder)
1247 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1248
1249 def test_getreader(self):
1250 self.assertRaises(TypeError, codecs.getreader)
1251 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1252
1253 def test_getwriter(self):
1254 self.assertRaises(TypeError, codecs.getwriter)
1255 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001256
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001257 def test_lookup_issue1813(self):
1258 # Issue #1813: under Turkish locales, lookup of some codecs failed
1259 # because 'I' is lowercased as a dotless "i"
1260 oldlocale = locale.getlocale(locale.LC_CTYPE)
1261 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1262 try:
1263 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1264 except locale.Error:
1265 # Unsupported locale on this system
1266 self.skipTest('test needs Turkish locale')
1267 c = codecs.lookup('ASCII')
1268 self.assertEqual(c.name, 'ascii')
1269
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001270class StreamReaderTest(unittest.TestCase):
1271
1272 def setUp(self):
1273 self.reader = codecs.getreader('utf-8')
1274 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1275
1276 def test_readlines(self):
1277 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001278 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001279
Georg Brandl8f99f812006-10-29 08:39:22 +00001280class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001281
Georg Brandl8f99f812006-10-29 08:39:22 +00001282 def test_basic(self):
1283 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001284 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001285 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001286
1287 f = StringIO.StringIO()
1288 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1289 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001290 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001291
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001292class Str2StrTest(unittest.TestCase):
1293
1294 def test_read(self):
1295 sin = "\x80".encode("base64_codec")
1296 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1297 sout = reader.read()
1298 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001299 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001300
1301 def test_readline(self):
1302 sin = "\x80".encode("base64_codec")
1303 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1304 sout = reader.readline()
1305 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001306 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001307
# Codec names that BasicUnicodeTest iterates over.  Further names are
# appended after this list when the corresponding support is available.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
1412
# "mbcs" is only added when this build's codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Copied at this point, before the optional codecs are appended further
# down, so this list does not pick up "bz2_codec" / "zlib_codec".
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2 and zlib may be missing from a build; their codecs are only tested
# when the module can be imported, and never through the stream API.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1458
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks run against every name in all_unicode_encodings."""

    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by the registry must match the name used
            # for the lookup once "_" and "-" are treated alike; the
            # "_codec" suffix is re-added and latin_1 is special-cased
            # before comparing.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless round trip.  (The local is called "encoded" rather
            # than "bytes" so that it does not shadow the builtin.)
            (encoded, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Data is pushed through one character/byte at a time so
                # that the codec has to cope with partial input.
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempt from the non-string argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1581
class BasicStrTest(unittest.TestCase):
    """Stateless round trip through every name in all_string_encodings."""

    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            # "encoded" rather than "bytes" so the builtin is not shadowed.
            (encoded, size) = codecs.getencoder(encoding)(s)
            # Report the codec on failure, as BasicUnicodeTest does.
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1590
class CharmapTest(unittest.TestCase):
    """Exercise codecs.charmap_decode() with three kinds of decoding map.

    The map is given as a unicode string indexed by byte value, as a dict
    from byte value to unicode string, and as a dict from byte value to
    code point.  Each variant is checked with the "strict", "replace" and
    "ignore" error handlers.
    """

    # The three bytes 0x00, 0x01 and 0x02 that almost every check decodes.
    # Spelled with the b prefix throughout so all checks use one literal.
    DATA = b"\x00\x01\x02"

    # Every byte value from 0x00 to 0xff, once each, in ascending order.
    ALLBYTES = bytes(bytearray(range(256)))

    def _check_decode(self, errors, mapping, expected):
        """Assert that DATA decodes to *expected* and is fully consumed.

        errors   -- name of the error handler passed to charmap_decode()
        mapping  -- decoding map (unicode string or dict)
        expected -- unicode string the decoder has to produce
        """
        self.assertEqual(
            codecs.charmap_decode(self.DATA, errors, mapping),
            (expected, len(self.DATA))
        )

    def _check_strict_raises(self, exception, mapping):
        """Assert that decoding DATA in "strict" mode raises *exception*."""
        self.assertRaises(
            exception,
            codecs.charmap_decode, self.DATA, "strict", mapping
        )

    def test_decode_with_string_map(self):
        # The map is a unicode string; byte value i selects character i.
        self._check_decode("strict", u"abc", u"abc")

        # A map that is too short, or that holds U+FFFE for the byte, is
        # an error in strict mode ...
        self._check_strict_raises(UnicodeDecodeError, u"ab")
        self._check_strict_raises(UnicodeDecodeError, u"ab\ufffe")

        # ... yields U+FFFD with "replace" ...
        self._check_decode("replace", u"ab", u"ab\ufffd")
        self._check_decode("replace", u"ab\ufffe", u"ab\ufffd")

        # ... and is dropped from the output with "ignore".
        self._check_decode("ignore", u"ab", u"ab")
        self._check_decode("ignore", u"ab\ufffe", u"ab")

        # An empty map combined with "ignore" discards every byte.
        self.assertEqual(
            codecs.charmap_decode(self.ALLBYTES, "ignore", u""),
            (u"", len(self.ALLBYTES))
        )

    def test_decode_with_int2str_map(self):
        # The map is a dict from byte value to unicode string.
        self._check_decode("strict", {0: u'a', 1: u'b', 2: u'c'}, u"abc")

        # A byte may map to several characters, to a non-BMP character,
        # or to the empty string.
        self._check_decode("strict",
                           {0: u'Aa', 1: u'Bb', 2: u'Cc'}, u"AaBbCc")
        self._check_decode("strict",
                           {0: u'\U0010FFFF', 1: u'b', 2: u'c'},
                           u"\U0010FFFFbc")
        self._check_decode("strict", {0: u'a', 1: u'b', 2: u''}, u"ab")

        # Strict mode: a missing key and a None value are errors.
        self._check_strict_raises(UnicodeDecodeError, {0: u'a', 1: u'b'})
        self._check_strict_raises(UnicodeDecodeError,
                                  {0: u'a', 1: u'b', 2: None})
        # Issue #14850
        self._check_strict_raises(UnicodeDecodeError,
                                  {0: u'a', 1: u'b', 2: u'\ufffe'})

        # "replace": the same three maps yield U+FFFD for the third byte.
        self._check_decode("replace", {0: u'a', 1: u'b'}, u"ab\ufffd")
        self._check_decode("replace",
                           {0: u'a', 1: u'b', 2: None}, u"ab\ufffd")
        # Issue #14850
        self._check_decode("replace",
                           {0: u'a', 1: u'b', 2: u'\ufffe'}, u"ab\ufffd")

        # "ignore": the same three maps drop the third byte.
        self._check_decode("ignore", {0: u'a', 1: u'b'}, u"ab")
        self._check_decode("ignore", {0: u'a', 1: u'b', 2: None}, u"ab")
        # Issue #14850
        self._check_decode("ignore",
                           {0: u'a', 1: u'b', 2: u'\ufffe'}, u"ab")

        # An empty dict combined with "ignore" discards every byte.
        self.assertEqual(
            codecs.charmap_decode(self.ALLBYTES, "ignore", {}),
            (u"", len(self.ALLBYTES))
        )

    def test_decode_with_int2int_map(self):
        # The map is a dict from byte value to integer code point.
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')

        self._check_decode("strict", {0: a, 1: b, 2: c}, u"abc")

        # Issue #15379
        self._check_decode("strict",
                           {0: 0x10FFFF, 1: b, 2: c}, u"\U0010FFFFbc")

        # 0x110000 is rejected with TypeError rather than a decode error.
        self._check_strict_raises(TypeError, {0: 0x110000, 1: b, 2: c})

        # A missing key and the code point 0xFFFE are errors in strict
        # mode ...
        self._check_strict_raises(UnicodeDecodeError, {0: a, 1: b})
        self._check_strict_raises(UnicodeDecodeError,
                                  {0: a, 1: b, 2: 0xFFFE})

        # ... yield U+FFFD with "replace" ...
        self._check_decode("replace", {0: a, 1: b}, u"ab\ufffd")
        self._check_decode("replace", {0: a, 1: b, 2: 0xFFFE}, u"ab\ufffd")

        # ... and are dropped from the output with "ignore".
        self._check_decode("ignore", {0: a, 1: b}, u"ab")
        self._check_decode("ignore", {0: a, 1: b, 2: 0xFFFE}, u"ab")
1773
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001774
class WithStmtTest(unittest.TestCase):
    """Check that the codec stream wrappers work in a ``with`` statement."""

    def test_encodedfile(self):
        # The underlying stream holds the two bytes C3 BC; reading through
        # the EncodedFile wrapper must give the single byte FC.
        stream = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(stream, "latin-1", "utf-8") as recoded:
            self.assertEqual(recoded.read(), "\xfc")

    def test_streamreaderwriter(self):
        # Reading the bytes C3 BC through a utf-8 StreamReaderWriter must
        # give the single character U+00FC.
        stream = StringIO.StringIO("\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            stream, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001787
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001788
class BomTest(unittest.TestCase):
    """Check when a byte order mark is written to files opened via codecs."""

    def _reopen(self, encoding):
        """Open the scratch file test_support.TESTFN in 'w+' mode."""
        return codecs.open(test_support.TESTFN, 'w+', encoding=encoding)

    def test_seek0(self):
        text = u"1234567890"
        twice = text * 2
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in encodings:
            # Two consecutive writes must produce only one BOM; reading
            # the file back twice gives the same text both times.
            with self._reopen(encoding) as f:
                f.write(text)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), twice)
                f.seek(0)
                self.assertEqual(f.read(), twice)

            # After seek(0) the next write must emit the BOM again.
            with self._reopen(encoding) as f:
                f.write(text[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # Same check, driving the underlying StreamWriter directly.
            with self._reopen(encoding) as f:
                writer = f.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # A seek() to a position other than the start must not cause
            # another BOM to be written.
            with self._reopen(encoding) as f:
                f.write(text)
                f.seek(f.tell())
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), twice)

            # Same check, driving the underlying StreamWriter directly.
            with self._reopen(encoding) as f:
                writer = f.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), twice)
Victor Stinner7df55da2010-05-22 13:37:56 +00001844
Victor Stinner262be5e2010-05-22 02:11:07 +00001845
def test_main():
    """Hand every test case class of this module to test_support.run_unittest."""
    test_cases = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        BomTest,
    )
    test_support.run_unittest(*test_cases)
Fred Drake2e2be372001-09-20 21:33:42 +00001876
1877
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()