blob: f620b4cde370bad50f9973c4ed05678ffe43e4d8 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Walter Dörwald9ae019b2006-03-18 14:22:26 +00005import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    In-memory FIFO: data written at one end is read back from the other end.
    """
    def __init__(self):
        # Everything written but not yet read.
        self._buffer = ""

    def write(self, chars):
        """Append chars to the end of the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to size items from the front of the queue.

        A negative size (the default) drains everything currently buffered.
        """
        if size < 0:
            head, self._buffer = self._buffer, ""
            return head
        head, self._buffer = self._buffer[:size], self._buffer[size:]
        return head
26
class ReadTest(unittest.TestCase):
    """Reader tests shared by the per-encoding test classes.

    Every method reads ``self.encoding``, which is not defined here; the
    subclasses in this file set it as a class attribute.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* one encoded byte at a time and verify each step.

        input is the unicode string that gets encoded with self.encoding.
        partialresults[i] is the text expected to have been decoded once
        encoded byte i has been fed in.  The same sequence is checked with a
        StreamReader, with an incremental decoder (freshly created and again
        after reset()), and finally the whole input is round-tripped through
        codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder: first
        # on the fresh decoder, then once more after reset() to check that
        # the reset method works properly
        d = codecs.getincrementaldecoder(self.encoding)()
        for needs_reset in (False, True):
            if needs_reset:
                d.reset()
            result = u""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(c)
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode("", True), u"")
            self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of input
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and return
            # them joined with "|" so that the line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple.  Building them with
        # u"\n \r\n \r \u2028".split() yields an empty list, because every
        # one of these characters is itself whitespace, and the loops below
        # would then never run.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        # Every line gets at least 200 characters: an empty first line would
        # make readalllines() stop immediately when keepends is false.
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        # readalllines() joins with "|", so the expected values must too
        self.assertEqual(readalllines(u"".join(vw), True), u"|".join(vw))
        self.assertEqual(readalllines(u"".join(vw), False), u"|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # consume the "xxx" line that follows every "a" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the lines below.
        # NOTE(review): the leading whitespace inside these literals looks
        # collapsed to a single space; confirm against the upstream file in
        # case the original regression depended on exact line lengths.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so the reader only ever sees
        # what has been written so far.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        # each line must come back intact, followed by u"" at end of input
        for expected in (s1, s2, s3, u""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        # each line must come back intact, followed by u"" at end of input
        for expected in (s1, s2, s3, s4, s5, u""):
            self.assertEqual(reader.readline(), expected)
247
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # u"spamspam" with a single leading BOM, in little and big endian order
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream with two separate write() calls
        raw = StringIO.StringIO()
        out = info.streamwriter(raw)
        out.write(u"spam")
        out.write(u"spam")
        d = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        inp = info.streamreader(StringIO.StringIO(d))
        self.assertEqual(inp.read(), u"spamspam")

    def test_badbom(self):
        # neither one nor two invalid 4-byte units may pass as a BOM
        for count in (4, 8):
            s = StringIO.StringIO(count*"\xff")
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a truncated unit is replaced or dropped, depending on the handler
        replaced = codecs.utf_32_decode('\x01', 'replace', True)
        self.assertEqual((u'\ufffd', 1), replaced)
        ignored = codecs.utf_32_decode('\x01', 'ignore', True)
        self.assertEqual((u'', 1), ignored)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = u'\U00010000' * 1024
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
328
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        # no BOM here: every fourth byte completes one character
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
368
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        # no BOM here: every fourth byte completes one character
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
408
409
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # u"spamspam" with a single leading BOM, in little and big endian order
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream with two separate write() calls
        raw = StringIO.StringIO()
        out = info.streamwriter(raw)
        out.write(u"spam")
        out.write(u"spam")
        d = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        inp = info.streamreader(StringIO.StringIO(d))
        self.assertEqual(inp.read(), u"spamspam")

    def test_badbom(self):
        # neither one nor two invalid 2-byte units may pass as a BOM
        for data in ("\xff\xff", "\xff\xff\xff\xff"):
            s = StringIO.StringIO(data)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a truncated unit is replaced or dropped, depending on the handler
        replaced = codecs.utf_16_decode('\x01', 'replace', True)
        self.assertEqual((u'\ufffd', 1), replaced)
        ignored = codecs.utf_16_decode('\x01', 'ignore', True)
        self.assertEqual((u'', 1), ignored)

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        reader = codecs.open(test_support.TESTFN, 'U',
                             encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000478
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # no BOM here: every second byte completes one character
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for malformed, replaced in tests:
            # strict decoding of the final chunk must fail ...
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              malformed, 'strict', True)
            # ... while 'replace' substitutes U+FFFD
            self.assertEqual(malformed.decode('utf-16le', 'replace'),
                             replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000511
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # no BOM here: every second byte completes one character
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for malformed, replaced in tests:
            # strict decoding of the final chunk must fail ...
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              malformed, 'strict', True)
            # ... while 'replace' substitutes U+FFFD
            self.assertEqual(malformed.decode('utf-16be', 'replace'),
                             replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000544
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # one entry per encoded byte; a character only shows up once its
        # last byte has been fed in
        expected_after_each_byte = [
            u"\x00",
            u"\x00",
            u"\x00\xff",
            u"\x00\xff",
            u"\x00\xff\u07ff",
            u"\x00\xff\u07ff",
            u"\x00\xff\u07ff",
            u"\x00\xff\u07ff\u0800",
            u"\x00\xff\u07ff\u0800",
            u"\x00\xff\u07ff\u0800",
            u"\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            expected_after_each_byte
        )
565
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # one entry per encoded byte of u"a+-b"
        expected_after_each_byte = [
            u"a",
            u"a",
            u"a+",
            u"a+-",
            u"a+-b",
        ]
        self.check_partial(u"a+-b", expected_after_each_byte)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000580
581class UTF16ExTest(unittest.TestCase):
582
583 def test_errors(self):
584 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
585
586 def test_bad_args(self):
587 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
588
589class ReadBufferTest(unittest.TestCase):
590
591 def test_array(self):
592 import array
593 self.assertEqual(
594 codecs.readbuffer_encode(array.array("c", "spam")),
595 ("spam", 4)
596 )
597
598 def test_empty(self):
599 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
600
601 def test_bad_args(self):
602 self.assertRaises(TypeError, codecs.readbuffer_encode)
603 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
604
605class CharBufferTest(unittest.TestCase):
606
607 def test_string(self):
608 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
609
610 def test_empty(self):
611 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
612
613 def test_bad_args(self):
614 self.assertRaises(TypeError, codecs.charbuffer_encode)
615 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
616
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # the signature added by the encoder is dropped again on decoding
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader, once with a
        # plain read() and once for each size hint, and check that the
        # collected chunks add up to unistring.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + range(1, 11) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # input that starts with the UTF-8 signature
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # same text without the signature
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
696
Walter Dörwald8709a422002-09-03 13:53:40 +0000697class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000698 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000699 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000700
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000701class RecodingTest(unittest.TestCase):
702 def test_recoding(self):
703 f = StringIO.StringIO()
704 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
705 f2.write(u"a")
706 f2.close()
707 # Python used to crash on this at exit because of a refcount
708 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000709
# Sample strings from RFC 3492, as (unicode text, punycode encoding) pairs.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
813
# Sanity check at import time: print every entry of the table that is not
# a two-element pair.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
817
818class PunycodeTest(unittest.TestCase):
819 def test_encode(self):
820 for uni, puny in punycode_testcases:
821 # Need to convert both strings to lower case, since
822 # some of the extended encodings use upper case, but our
823 # code produces only lower case. Converting just puny to
824 # lower is also insufficient, since some of the input characters
825 # are upper case.
Ezio Melotti2623a372010-11-21 13:34:58 +0000826 self.assertEqual(uni.encode("punycode").lower(), puny.lower())
Martin v. Löwis2548c732003-04-18 10:39:54 +0000827
828 def test_decode(self):
829 for uni, puny in punycode_testcases:
Ezio Melotti2623a372010-11-21 13:34:58 +0000830 self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000831
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000832class UnicodeInternalTest(unittest.TestCase):
833 def test_bug1251300(self):
834 # Decoding with unicode_internal used to not correctly handle "code
835 # points" above 0x10ffff on UCS-4 builds.
836 if sys.maxunicode > 0xffff:
837 ok = [
838 ("\x00\x10\xff\xff", u"\U0010ffff"),
839 ("\x00\x00\x01\x01", u"\U00000101"),
840 ("", u""),
841 ]
842 not_ok = [
843 "\x7f\xff\xff\xff",
844 "\x80\x00\x00\x00",
845 "\x81\x00\x00\x00",
846 "\x00",
847 "\x00\x00\x00\x00\x00",
848 ]
849 for internal, uni in ok:
850 if sys.byteorder == "little":
851 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +0000852 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000853 for internal in not_ok:
854 if sys.byteorder == "little":
855 internal = "".join(reversed(internal))
856 self.assertRaises(UnicodeDecodeError, internal.decode,
857 "unicode_internal")
858
859 def test_decode_error_attributes(self):
860 if sys.maxunicode > 0xffff:
861 try:
862 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
863 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +0000864 self.assertEqual("unicode_internal", ex.encoding)
865 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
866 self.assertEqual(4, ex.start)
867 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000868 else:
869 self.fail()
870
871 def test_decode_callback(self):
872 if sys.maxunicode > 0xffff:
873 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
874 decoder = codecs.getdecoder("unicode_internal")
875 ab = u"ab".encode("unicode_internal")
876 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
877 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +0000878 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000879
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000880 def test_encode_length(self):
881 # Issue 3739
882 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +0000883 self.assertEqual(encoder(u"a")[1], 1)
884 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000885
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000886 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +0000887 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000888
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings, kept
# in the order of sections 3.1 - 3.45 of the draft (NameprepTest reports a
# failure as "Test 3.<position in this list>"):
#   * expected is None -> nameprep() must raise UnicodeError for the input
#   * (None, None)     -> placeholder for a vector that is skipped; it keeps
#                         the numbering of the following entries intact
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE: the title above is quoted from the draft; the bytes actually used
    # below (\xef\xb9\xb6) are the UTF-8 form of U+FE76, not of U+FB38.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1041
1042
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() over the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        # Vectors are numbered from 1 to match the "3.<n>" section labels.
        for number, (raw, expected) in enumerate(nameprep_tests, 1):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            source = unicode(raw, "utf-8")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, source)
                continue
            expected = unicode(expected, "utf-8")
            try:
                self.assertEqual(nameprep(source), expected)
            except Exception as exc:
                # Re-raise with the vector number so the failure can be located.
                raise test_support.TestFailed(
                    "Test 3.%d: %s" % (number, str(exc)))
1061
Walter Dörwald78a0be62006-04-14 18:25:39 +00001062class IDNACodecTest(unittest.TestCase):
1063 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001064 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1065 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1066 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1067 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001068
1069 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001070 self.assertEqual(u"python.org".encode("idna"), "python.org")
1071 self.assertEqual("python.org.".encode("idna"), "python.org.")
1072 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1073 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001074
Martin v. Löwis8b595142005-08-25 11:03:38 +00001075 def test_stream(self):
1076 import StringIO
1077 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1078 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001079 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001080
Walter Dörwald78a0be62006-04-14 18:25:39 +00001081 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001082 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001083 "".join(codecs.iterdecode("python.org", "idna")),
1084 u"python.org"
1085 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001086 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001087 "".join(codecs.iterdecode("python.org.", "idna")),
1088 u"python.org."
1089 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001090 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001091 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1092 u"pyth\xf6n.org."
1093 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001094 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001095 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1096 u"pyth\xf6n.org."
1097 )
1098
1099 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001100 self.assertEqual(decoder.decode("xn--xam", ), u"")
1101 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1102 self.assertEqual(decoder.decode(u"rg"), u"")
1103 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001104
1105 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001106 self.assertEqual(decoder.decode("xn--xam", ), u"")
1107 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1108 self.assertEqual(decoder.decode("rg."), u"org.")
1109 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001110
1111 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001112 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001113 "".join(codecs.iterencode(u"python.org", "idna")),
1114 "python.org"
1115 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001116 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001117 "".join(codecs.iterencode(u"python.org.", "idna")),
1118 "python.org."
1119 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001120 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001121 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1122 "xn--pythn-mua.org."
1123 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001124 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001125 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1126 "xn--pythn-mua.org."
1127 )
1128
1129 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001130 self.assertEqual(encoder.encode(u"\xe4x"), "")
1131 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1132 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001133
1134 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001135 self.assertEqual(encoder.encode(u"\xe4x"), "")
1136 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1137 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001138
class CodecsModuleTest(unittest.TestCase):
    """Checks for the module-level helper functions of ``codecs``.

    Byte strings are written as b'' literals; on Python 2 these are plain
    ``str`` objects, so the checks are unchanged.
    """

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # Without an explicit encoding the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # Without an explicit encoding the default encoding is used.
        self.assertEqual(codecs.encode(u'abc'), b'abc')
        # u'\xffff' is U+00FF followed by "ff"; the non-ASCII first character
        # is what makes the ascii codec fail.
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        #
        # Remember the current setting as the string reported by setlocale()
        # itself so that the cleanup passes back exactly what the C library
        # gave us.  The normalised tuple returned by getlocale() is not
        # guaranteed to be accepted by setlocale() again.
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1193
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001194class StreamReaderTest(unittest.TestCase):
1195
1196 def setUp(self):
1197 self.reader = codecs.getreader('utf-8')
1198 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1199
1200 def test_readlines(self):
1201 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001202 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001203
Georg Brandl8f99f812006-10-29 08:39:22 +00001204class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001205
Georg Brandl8f99f812006-10-29 08:39:22 +00001206 def test_basic(self):
1207 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001208 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001209 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001210
1211 f = StringIO.StringIO()
1212 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1213 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001214 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001215
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001216class Str2StrTest(unittest.TestCase):
1217
1218 def test_read(self):
1219 sin = "\x80".encode("base64_codec")
1220 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1221 sout = reader.read()
1222 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001223 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001224
1225 def test_readline(self):
1226 sin = "\x80".encode("base64_codec")
1227 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1228 sout = reader.readline()
1229 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001230 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001231
# Names of the codecs that the generic tests in BasicUnicodeTest iterate
# over.  Optional codecs (mbcs below, bz2_codec and zlib_codec further down)
# are appended only when they are available.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build's codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1339
# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Copy taken *before* bz2_codec/zlib_codec are appended below, so those two
# are excluded from the stream checks only and still go through the
# incremental coder checks in BasicUnicodeTest.test_basics.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2 and zlib are optional modules; their codecs are only registered for
# testing when the module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1382
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks applied to every codec in ``all_unicode_encodings``."""

    def test_basics(self):
        sample = u"abc123"  # all codecs should be able to encode these
        template = "%r != %r (encoding=%r)"

        def encode_stepwise(enc, flush):
            # Feed ``sample`` one character at a time; optionally flush.
            pieces = [enc.encode(char) for char in sample]
            if flush:
                pieces.append(enc.encode(u"", True))
            return "".join(pieces)

        def decode_stepwise(dec, data, flush):
            # Feed ``data`` one byte at a time; optionally flush.
            pieces = [dec.decode(byte) for byte in data]
            if flush:
                pieces.append(dec.decode("", True))
            return u"".join(pieces)

        for encoding in all_unicode_encodings:
            # The registered name must match the list entry, up to "-"/"_".
            canonical = codecs.lookup(encoding).name
            if encoding == "latin_1":
                canonical = "latin_1"
            elif encoding.endswith("_codec"):
                canonical += "_codec"
            self.assertEqual(encoding.replace("_", "-"),
                             canonical.replace("_", "-"))

            # stateless encoder/decoder round trip
            encoded, consumed = codecs.getencoder(encoding)(sample)
            self.assertEqual(consumed, len(sample),
                             template % (consumed, len(sample), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, sample,
                             template % (decoded, sample, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                pipe = Queue()
                writer = codecs.getwriter(encoding)(pipe)
                encoded = ""
                for char in sample:
                    writer.write(char)
                    encoded += pipe.read()
                pipe = Queue()
                reader = codecs.getreader(encoding)(pipe)
                decoded = u""
                for byte in encoded:
                    pipe.write(byte)
                    decoded += reader.read()
                self.assertEqual(decoded, sample,
                                 template % (decoded, sample, encoding))

            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encoded = encode_stepwise(encoder, True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decoded = decode_stepwise(decoder, encoded, True)
                self.assertEqual(decoded, sample,
                                 template % (decoded, sample, encoding))

                # check C API
                encoded = encode_stepwise(cencoder, True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decoded = decode_stepwise(cdecoder, encoded, True)
                self.assertEqual(decoded, sample,
                                 template % (decoded, sample, encoding))

                # check iterencode()/iterdecode()
                roundtrip = u"".join(codecs.iterdecode(
                    codecs.iterencode(sample, encoding), encoding))
                self.assertEqual(roundtrip, sample,
                                 template % (roundtrip, sample, encoding))

                # check iterencode()/iterdecode() with empty string
                roundtrip = u"".join(codecs.iterdecode(
                    codecs.iterencode(u"", encoding), encoding))
                self.assertEqual(roundtrip, u"")

            if encoding in only_strict_mode:
                continue

            # check incremental decoder/encoder with errors argument
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                continue

            encoded = encode_stepwise(encoder, False)
            decoder = codecs.getincrementaldecoder(encoding)("ignore")
            decoded = decode_stepwise(decoder, encoded, False)
            self.assertEqual(decoded, sample,
                             template % (decoded, sample, encoding))

            encoded = encode_stepwise(cencoder, False)
            cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
            decoded = decode_stepwise(cdecoder, encoded, False)
            self.assertEqual(decoded, sample,
                             template % (decoded, sample, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        text = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # idna is left out as well.  FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            backing = StringIO.StringIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(backing)
            for _ in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                first_line = reader.readline()
                self.assertEqual(text[:len(first_line)], first_line)

    def test_bad_decode_args(self):
        # Decoders must reject a missing argument; all but idna and punycode
        # are also checked to reject an int.
        lenient = ("idna", "punycode")
        for encoding in all_unicode_encodings:
            decode = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decode)
            if encoding in lenient:
                continue
            self.assertRaises(TypeError, decode, 42)

    def test_bad_encode_args(self):
        # Encoders must reject a missing argument.
        for encoding in all_unicode_encodings:
            encode = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encode)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)
1505
class BasicStrTest(unittest.TestCase):
    """Stateless round trip through every codec in ``all_string_encodings``."""

    def test_basics(self):
        sample = "abc123"
        for encoding in all_string_encodings:
            encode = codecs.getencoder(encoding)
            decode = codecs.getdecoder(encoding)

            # The encoder must report the whole input as consumed.
            encoded, consumed = encode(sample)
            self.assertEqual(consumed, len(sample))

            decoded, consumed = decode(encoded)
            self.assertEqual(
                decoded, sample,
                "%r != %r (encoding=%r)" % (decoded, sample, encoding))
1514
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with the supported mapping kinds.

    Input is always written as a b"" literal (a plain ``str`` on Python 2).
    With every mapping kind, an unmapped byte must raise in "strict" mode,
    become U+FFFD in "replace" mode and be dropped in "ignore" mode.
    """

    def test_decode_with_string_map(self):
        # The mapping is a unicode string indexed by byte value; U+FFFE in
        # the string marks a byte as unmapped.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        # Every possible byte value, none of them mapped.
        allbytes = bytes(bytearray(range(256)))
        self.assertEqual(len(allbytes), 256)
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        # The mapping is a dict from byte value to a unicode string, which
        # may be empty or longer than one character; None marks a byte as
        # unmapped, just like a missing key.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u'c'}),
            (u"abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
            (u"AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
            (u"\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u''}),
            (u"ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b'}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab", 3)
        )

        # Every possible byte value, none of them mapped.  Going through
        # bytearray matters: on Python 2, bytes(range(256)) is str() of the
        # list, i.e. its repr, and would only contain digits and punctuation.
        allbytes = bytes(bytearray(range(256)))
        self.assertEqual(len(allbytes), 256)
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            (u"", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        # The mapping is a dict from byte value to code point.
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            (u"abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            (u"\U0010FFFFbc", 3)
        )

        # A value above U+10FFFF is not a code point at all.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 0x110000, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            (u"ab", 3)
        )
1651
1652
Georg Brandl8f99f812006-10-29 08:39:22 +00001653class WithStmtTest(unittest.TestCase):
1654 def test_encodedfile(self):
1655 f = StringIO.StringIO("\xc3\xbc")
1656 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001657 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001658
1659 def test_streamreaderwriter(self):
1660 f = StringIO.StringIO("\xc3\xbc")
1661 info = codecs.lookup("utf-8")
1662 with codecs.StreamReaderWriter(f, info.streamreader,
1663 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001664 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001665
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001666
class BomTest(unittest.TestCase):
    """BOM handling of files opened with codecs.open() in 'w+' mode."""

    def test_seek0(self):
        payload = u"1234567890"
        doubled = payload * 2
        path = test_support.TESTFN
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(test_support.unlink, path)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(payload)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(payload[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.writer.write(payload[0])
                self.assertNotEqual(stream.writer.tell(), 0)
                stream.writer.seek(0)
                stream.writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(payload)
                stream.seek(stream.tell())
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.writer.write(payload)
                stream.writer.seek(stream.writer.tell())
                stream.writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinner7df55da2010-05-22 13:37:56 +00001722
Victor Stinner262be5e2010-05-22 02:11:07 +00001723
def test_main():
    # Hand every test case class of this module to the regrtest helper,
    # in the order listed here.
    test_cases = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        BomTest,
    )
    test_support.run_unittest(*test_cases)
Fred Drake2e2be372001-09-20 21:33:42 +00001754
1755
# Run the whole suite when this module is executed as a script.
if __name__ == "__main__":
    test_main()