blob: 0ec8bf5a4b4f2689d6fca609caad5c456b25b544 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02005import sys, StringIO
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
Serhiy Storchakac8e58122013-01-29 10:20:34 +02007def coding_checker(self, coder):
8 def check(input, expect):
9 self.assertEqual(coder(input), (expect, len(input)))
10 return check
11
Walter Dörwald69652032004-09-07 20:24:22 +000012class Queue(object):
13 """
14 queue: write bytes at one end, read bytes from the other end
15 """
16 def __init__(self):
17 self._buffer = ""
18
19 def write(self, chars):
20 self._buffer += chars
21
22 def read(self, size=-1):
23 if size<0:
24 s = self._buffer
25 self._buffer = ""
26 return s
27 else:
28 s = self._buffer[:size]
29 self._buffer = self._buffer[size:]
30 return s
31
Walter Dörwalde57d7b12004-12-21 22:24:00 +000032class ReadTest(unittest.TestCase):
33 def check_partial(self, input, partialresults):
Walter Dörwald69652032004-09-07 20:24:22 +000034 # get a StreamReader for the encoding and feed the bytestring version
Walter Dörwaldfc7e72d2007-11-19 12:14:05 +000035 # of input to the reader byte by byte. Read everything available from
Walter Dörwald69652032004-09-07 20:24:22 +000036 # the StreamReader and check that the results equal the appropriate
37 # entries from partialresults.
38 q = Queue()
Walter Dörwalde57d7b12004-12-21 22:24:00 +000039 r = codecs.getreader(self.encoding)(q)
Walter Dörwald69652032004-09-07 20:24:22 +000040 result = u""
Walter Dörwalde57d7b12004-12-21 22:24:00 +000041 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
Walter Dörwald69652032004-09-07 20:24:22 +000042 q.write(c)
43 result += r.read()
44 self.assertEqual(result, partialresult)
45 # check that there's nothing left in the buffers
46 self.assertEqual(r.read(), u"")
47 self.assertEqual(r.bytebuffer, "")
48 self.assertEqual(r.charbuffer, u"")
49
Serhiy Storchaka9a118f12016-04-17 09:37:36 +030050 # do the check again, this time using an incremental decoder
Walter Dörwaldabb02e52006-03-15 11:35:15 +000051 d = codecs.getincrementaldecoder(self.encoding)()
52 result = u""
53 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
54 result += d.decode(c)
55 self.assertEqual(result, partialresult)
56 # check that there's nothing left in the buffers
57 self.assertEqual(d.decode("", True), u"")
58 self.assertEqual(d.buffer, "")
59
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +000060 # Check whether the reset method works properly
Walter Dörwaldabb02e52006-03-15 11:35:15 +000061 d.reset()
62 result = u""
63 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
64 result += d.decode(c)
65 self.assertEqual(result, partialresult)
66 # check that there's nothing left in the buffers
67 self.assertEqual(d.decode("", True), u"")
68 self.assertEqual(d.buffer, "")
69
70 # check iterdecode()
71 encoded = input.encode(self.encoding)
72 self.assertEqual(
73 input,
74 u"".join(codecs.iterdecode(encoded, self.encoding))
75 )
76
Walter Dörwalde57d7b12004-12-21 22:24:00 +000077 def test_readline(self):
78 def getreader(input):
79 stream = StringIO.StringIO(input.encode(self.encoding))
80 return codecs.getreader(self.encoding)(stream)
81
Walter Dörwaldca199432006-03-06 22:39:12 +000082 def readalllines(input, keepends=True, size=None):
Walter Dörwalde57d7b12004-12-21 22:24:00 +000083 reader = getreader(input)
84 lines = []
85 while True:
Walter Dörwaldca199432006-03-06 22:39:12 +000086 line = reader.readline(size=size, keepends=keepends)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000087 if not line:
88 break
89 lines.append(line)
Walter Dörwaldca199432006-03-06 22:39:12 +000090 return "|".join(lines)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000091
92 s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
Walter Dörwaldca199432006-03-06 22:39:12 +000093 sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
94 sexpectednoends = u"foo|bar|baz|spam|eggs"
95 self.assertEqual(readalllines(s, True), sexpected)
96 self.assertEqual(readalllines(s, False), sexpectednoends)
97 self.assertEqual(readalllines(s, True, 10), sexpected)
98 self.assertEqual(readalllines(s, False, 10), sexpectednoends)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000099
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200100 lineends = ("\n", "\r\n", "\r", u"\u2028")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000101 # Test long lines (multiple calls to read() in readline())
102 vw = []
103 vwo = []
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200104 for (i, lineend) in enumerate(lineends):
105 vw.append((i*200+200)*u"\u3042" + lineend)
106 vwo.append((i*200+200)*u"\u3042")
107 self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
108 self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000109
110 # Test lines where the first read might end with \r, so the
111 # reader has to look ahead whether this is a lone \r or a \r\n
112 for size in xrange(80):
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200113 for lineend in lineends:
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000114 s = 10*(size*u"a" + lineend + u"xxx\n")
115 reader = getreader(s)
116 for i in xrange(10):
117 self.assertEqual(
118 reader.readline(keepends=True),
119 size*u"a" + lineend,
120 )
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200121 self.assertEqual(
122 reader.readline(keepends=True),
123 "xxx\n",
124 )
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000125 reader = getreader(s)
126 for i in xrange(10):
127 self.assertEqual(
128 reader.readline(keepends=False),
129 size*u"a",
130 )
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200131 self.assertEqual(
132 reader.readline(keepends=False),
133 "xxx",
134 )
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000135
Serhiy Storchaka2403a782014-01-26 19:20:24 +0200136 def test_mixed_readline_and_read(self):
137 lines = ["Humpty Dumpty sat on a wall,\n",
138 "Humpty Dumpty had a great fall.\r\n",
139 "All the king's horses and all the king's men\r",
140 "Couldn't put Humpty together again."]
141 data = ''.join(lines)
142 def getreader():
143 stream = StringIO.StringIO(data.encode(self.encoding))
144 return codecs.getreader(self.encoding)(stream)
145
146 # Issue #8260: Test readline() followed by read()
147 f = getreader()
148 self.assertEqual(f.readline(), lines[0])
149 self.assertEqual(f.read(), ''.join(lines[1:]))
150 self.assertEqual(f.read(), '')
151
Miss Islington (bot)fc73c542017-11-28 16:15:43 -0800152 # Issue #32110: Test readline() followed by read(n)
153 f = getreader()
154 self.assertEqual(f.readline(), lines[0])
155 self.assertEqual(f.read(1), lines[1][0])
156 self.assertEqual(f.read(0), '')
157 self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])
158
Serhiy Storchaka2403a782014-01-26 19:20:24 +0200159 # Issue #16636: Test readline() followed by readlines()
160 f = getreader()
161 self.assertEqual(f.readline(), lines[0])
162 self.assertEqual(f.readlines(), lines[1:])
163 self.assertEqual(f.read(), '')
164
Miss Islington (bot)fc73c542017-11-28 16:15:43 -0800165 # Test read(n) followed by read()
Serhiy Storchaka2403a782014-01-26 19:20:24 +0200166 f = getreader()
167 self.assertEqual(f.read(size=40, chars=5), data[:5])
168 self.assertEqual(f.read(), data[5:])
169 self.assertEqual(f.read(), '')
170
Miss Islington (bot)fc73c542017-11-28 16:15:43 -0800171 # Issue #32110: Test read(n) followed by read(n)
172 f = getreader()
173 self.assertEqual(f.read(size=40, chars=5), data[:5])
174 self.assertEqual(f.read(1), data[5])
175 self.assertEqual(f.read(0), '')
176 self.assertEqual(f.read(100), data[6:106])
177
178 # Issue #12446: Test read(n) followed by readlines()
Serhiy Storchaka2403a782014-01-26 19:20:24 +0200179 f = getreader()
180 self.assertEqual(f.read(size=40, chars=5), data[:5])
181 self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
182 self.assertEqual(f.read(), '')
183
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000184 def test_bug1175396(self):
185 s = [
186 '<%!--===================================================\r\n',
187 ' BLOG index page: show recent articles,\r\n',
188 ' today\'s articles, or articles of a specific date.\r\n',
189 '========================================================--%>\r\n',
190 '<%@inputencoding="ISO-8859-1"%>\r\n',
191 '<%@pagetemplate=TEMPLATE.y%>\r\n',
192 '<%@import=import frog.util, frog%>\r\n',
193 '<%@import=import frog.objects%>\r\n',
194 '<%@import=from frog.storageerrors import StorageError%>\r\n',
195 '<%\r\n',
196 '\r\n',
197 'import logging\r\n',
198 'log=logging.getLogger("Snakelets.logger")\r\n',
199 '\r\n',
200 '\r\n',
201 'user=self.SessionCtx.user\r\n',
202 'storageEngine=self.SessionCtx.storageEngine\r\n',
203 '\r\n',
204 '\r\n',
205 'def readArticlesFromDate(date, count=None):\r\n',
206 ' entryids=storageEngine.listBlogEntries(date)\r\n',
207 ' entryids.reverse() # descending\r\n',
208 ' if count:\r\n',
209 ' entryids=entryids[:count]\r\n',
210 ' try:\r\n',
211 ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
212 ' except StorageError,x:\r\n',
213 ' log.error("Error loading articles: "+str(x))\r\n',
214 ' self.abort("cannot load articles")\r\n',
215 '\r\n',
216 'showdate=None\r\n',
217 '\r\n',
218 'arg=self.Request.getArg()\r\n',
219 'if arg=="today":\r\n',
220 ' #-------------------- TODAY\'S ARTICLES\r\n',
221 ' self.write("<h2>Today\'s articles</h2>")\r\n',
222 ' showdate = frog.util.isodatestr() \r\n',
223 ' entries = readArticlesFromDate(showdate)\r\n',
224 'elif arg=="active":\r\n',
225 ' #-------------------- ACTIVE ARTICLES redirect\r\n',
226 ' self.Yredirect("active.y")\r\n',
227 'elif arg=="login":\r\n',
228 ' #-------------------- LOGIN PAGE redirect\r\n',
229 ' self.Yredirect("login.y")\r\n',
230 'elif arg=="date":\r\n',
231 ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
232 ' showdate = self.Request.getParameter("date")\r\n',
233 ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
234 ' entries = readArticlesFromDate(showdate)\r\n',
235 'else:\r\n',
236 ' #-------------------- RECENT ARTICLES\r\n',
237 ' self.write("<h2>Recent articles</h2>")\r\n',
238 ' dates=storageEngine.listBlogEntryDates()\r\n',
239 ' if dates:\r\n',
240 ' entries=[]\r\n',
241 ' SHOWAMOUNT=10\r\n',
242 ' for showdate in dates:\r\n',
243 ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
244 ' if len(entries)>=SHOWAMOUNT:\r\n',
245 ' break\r\n',
246 ' \r\n',
247 ]
248 stream = StringIO.StringIO("".join(s).encode(self.encoding))
249 reader = codecs.getreader(self.encoding)(stream)
250 for (i, line) in enumerate(reader):
251 self.assertEqual(line, s[i])
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000252
253 def test_readlinequeue(self):
254 q = Queue()
255 writer = codecs.getwriter(self.encoding)(q)
256 reader = codecs.getreader(self.encoding)(q)
257
258 # No lineends
259 writer.write(u"foo\r")
260 self.assertEqual(reader.readline(keepends=False), u"foo")
261 writer.write(u"\nbar\r")
Walter Dörwald43148c82005-04-21 21:45:36 +0000262 self.assertEqual(reader.readline(keepends=False), u"")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000263 self.assertEqual(reader.readline(keepends=False), u"bar")
264 writer.write(u"baz")
265 self.assertEqual(reader.readline(keepends=False), u"baz")
266 self.assertEqual(reader.readline(keepends=False), u"")
267
268 # Lineends
269 writer.write(u"foo\r")
270 self.assertEqual(reader.readline(keepends=True), u"foo\r")
271 writer.write(u"\nbar\r")
Walter Dörwald43148c82005-04-21 21:45:36 +0000272 self.assertEqual(reader.readline(keepends=True), u"\n")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000273 self.assertEqual(reader.readline(keepends=True), u"bar\r")
274 writer.write(u"baz")
275 self.assertEqual(reader.readline(keepends=True), u"baz")
276 self.assertEqual(reader.readline(keepends=True), u"")
277 writer.write(u"foo\r\n")
278 self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
279
Walter Dörwald9fa09462005-01-10 12:01:39 +0000280 def test_bug1098990_a(self):
281 s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
282 s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
283 s3 = u"next line.\r\n"
284
285 s = (s1+s2+s3).encode(self.encoding)
286 stream = StringIO.StringIO(s)
287 reader = codecs.getreader(self.encoding)(stream)
288 self.assertEqual(reader.readline(), s1)
289 self.assertEqual(reader.readline(), s2)
290 self.assertEqual(reader.readline(), s3)
291 self.assertEqual(reader.readline(), u"")
292
293 def test_bug1098990_b(self):
294 s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
295 s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
296 s3 = u"stillokay:bbbbxx\r\n"
297 s4 = u"broken!!!!badbad\r\n"
298 s5 = u"againokay.\r\n"
299
300 s = (s1+s2+s3+s4+s5).encode(self.encoding)
301 stream = StringIO.StringIO(s)
302 reader = codecs.getreader(self.encoding)(stream)
303 self.assertEqual(reader.readline(), s1)
304 self.assertEqual(reader.readline(), s2)
305 self.assertEqual(reader.readline(), s3)
306 self.assertEqual(reader.readline(), s4)
307 self.assertEqual(reader.readline(), s5)
308 self.assertEqual(reader.readline(), u"")
309
Walter Dörwald6e390802007-08-17 16:41:28 +0000310class UTF32Test(ReadTest):
311 encoding = "utf-32"
312
313 spamle = ('\xff\xfe\x00\x00'
314 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
315 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
316 spambe = ('\x00\x00\xfe\xff'
317 '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
318 '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
319
320 def test_only_one_bom(self):
321 _,_,reader,writer = codecs.lookup(self.encoding)
322 # encode some stream
323 s = StringIO.StringIO()
324 f = writer(s)
325 f.write(u"spam")
326 f.write(u"spam")
327 d = s.getvalue()
328 # check whether there is exactly one BOM in it
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000329 self.assertTrue(d == self.spamle or d == self.spambe)
Walter Dörwald6e390802007-08-17 16:41:28 +0000330 # try to read it back
331 s = StringIO.StringIO(d)
332 f = reader(s)
Ezio Melotti2623a372010-11-21 13:34:58 +0000333 self.assertEqual(f.read(), u"spamspam")
Walter Dörwald6e390802007-08-17 16:41:28 +0000334
335 def test_badbom(self):
336 s = StringIO.StringIO(4*"\xff")
337 f = codecs.getreader(self.encoding)(s)
338 self.assertRaises(UnicodeError, f.read)
339
340 s = StringIO.StringIO(8*"\xff")
341 f = codecs.getreader(self.encoding)(s)
342 self.assertRaises(UnicodeError, f.read)
343
344 def test_partial(self):
345 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200346 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000347 [
348 u"", # first byte of BOM read
349 u"", # second byte of BOM read
350 u"", # third byte of BOM read
351 u"", # fourth byte of BOM read => byteorder known
352 u"",
353 u"",
354 u"",
355 u"\x00",
356 u"\x00",
357 u"\x00",
358 u"\x00",
359 u"\x00\xff",
360 u"\x00\xff",
361 u"\x00\xff",
362 u"\x00\xff",
363 u"\x00\xff\u0100",
364 u"\x00\xff\u0100",
365 u"\x00\xff\u0100",
366 u"\x00\xff\u0100",
367 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200368 u"\x00\xff\u0100\uffff",
369 u"\x00\xff\u0100\uffff",
370 u"\x00\xff\u0100\uffff",
371 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000372 ]
373 )
374
Georg Brandle9741f32009-09-17 11:28:09 +0000375 def test_handlers(self):
376 self.assertEqual((u'\ufffd', 1),
377 codecs.utf_32_decode('\x01', 'replace', True))
378 self.assertEqual((u'', 1),
379 codecs.utf_32_decode('\x01', 'ignore', True))
380
Walter Dörwald6e390802007-08-17 16:41:28 +0000381 def test_errors(self):
382 self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
383 "\xff", "strict", True)
384
Antoine Pitroucca3a3f2010-06-11 21:42:26 +0000385 def test_issue8941(self):
386 # Issue #8941: insufficient result allocation when decoding into
387 # surrogate pairs on UCS-2 builds.
388 encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
389 self.assertEqual(u'\U00010000' * 1024,
390 codecs.utf_32_decode(encoded_le)[0])
391 encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
392 self.assertEqual(u'\U00010000' * 1024,
393 codecs.utf_32_decode(encoded_be)[0])
394
Walter Dörwald6e390802007-08-17 16:41:28 +0000395class UTF32LETest(ReadTest):
396 encoding = "utf-32-le"
397
398 def test_partial(self):
399 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200400 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000401 [
402 u"",
403 u"",
404 u"",
405 u"\x00",
406 u"\x00",
407 u"\x00",
408 u"\x00",
409 u"\x00\xff",
410 u"\x00\xff",
411 u"\x00\xff",
412 u"\x00\xff",
413 u"\x00\xff\u0100",
414 u"\x00\xff\u0100",
415 u"\x00\xff\u0100",
416 u"\x00\xff\u0100",
417 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200418 u"\x00\xff\u0100\uffff",
419 u"\x00\xff\u0100\uffff",
420 u"\x00\xff\u0100\uffff",
421 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000422 ]
423 )
424
425 def test_simple(self):
426 self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
427
428 def test_errors(self):
429 self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
430 "\xff", "strict", True)
431
Antoine Pitroucca3a3f2010-06-11 21:42:26 +0000432 def test_issue8941(self):
433 # Issue #8941: insufficient result allocation when decoding into
434 # surrogate pairs on UCS-2 builds.
435 encoded = '\x00\x00\x01\x00' * 1024
436 self.assertEqual(u'\U00010000' * 1024,
437 codecs.utf_32_le_decode(encoded)[0])
438
Walter Dörwald6e390802007-08-17 16:41:28 +0000439class UTF32BETest(ReadTest):
440 encoding = "utf-32-be"
441
442 def test_partial(self):
443 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200444 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000445 [
446 u"",
447 u"",
448 u"",
449 u"\x00",
450 u"\x00",
451 u"\x00",
452 u"\x00",
453 u"\x00\xff",
454 u"\x00\xff",
455 u"\x00\xff",
456 u"\x00\xff",
457 u"\x00\xff\u0100",
458 u"\x00\xff\u0100",
459 u"\x00\xff\u0100",
460 u"\x00\xff\u0100",
461 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200462 u"\x00\xff\u0100\uffff",
463 u"\x00\xff\u0100\uffff",
464 u"\x00\xff\u0100\uffff",
465 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000466 ]
467 )
468
469 def test_simple(self):
470 self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
471
472 def test_errors(self):
473 self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
474 "\xff", "strict", True)
475
Antoine Pitroucca3a3f2010-06-11 21:42:26 +0000476 def test_issue8941(self):
477 # Issue #8941: insufficient result allocation when decoding into
478 # surrogate pairs on UCS-2 builds.
479 encoded = '\x00\x01\x00\x00' * 1024
480 self.assertEqual(u'\U00010000' * 1024,
481 codecs.utf_32_be_decode(encoded)[0])
482
483
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000484class UTF16Test(ReadTest):
485 encoding = "utf-16"
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000486
487 spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
488 spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
489
490 def test_only_one_bom(self):
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000491 _,_,reader,writer = codecs.lookup(self.encoding)
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000492 # encode some stream
493 s = StringIO.StringIO()
494 f = writer(s)
495 f.write(u"spam")
496 f.write(u"spam")
497 d = s.getvalue()
498 # check whether there is exactly one BOM in it
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000499 self.assertTrue(d == self.spamle or d == self.spambe)
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000500 # try to read it back
501 s = StringIO.StringIO(d)
502 f = reader(s)
Ezio Melotti2623a372010-11-21 13:34:58 +0000503 self.assertEqual(f.read(), u"spamspam")
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000504
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000505 def test_badbom(self):
506 s = StringIO.StringIO("\xff\xff")
Walter Dörwalda9620d12005-02-08 10:10:01 +0000507 f = codecs.getreader(self.encoding)(s)
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000508 self.assertRaises(UnicodeError, f.read)
509
510 s = StringIO.StringIO("\xff\xff\xff\xff")
Walter Dörwalda9620d12005-02-08 10:10:01 +0000511 f = codecs.getreader(self.encoding)(s)
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000512 self.assertRaises(UnicodeError, f.read)
513
Walter Dörwald69652032004-09-07 20:24:22 +0000514 def test_partial(self):
515 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200516 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000517 [
518 u"", # first byte of BOM read
519 u"", # second byte of BOM read => byteorder known
520 u"",
521 u"\x00",
522 u"\x00",
523 u"\x00\xff",
524 u"\x00\xff",
525 u"\x00\xff\u0100",
526 u"\x00\xff\u0100",
527 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200528 u"\x00\xff\u0100\uffff",
529 u"\x00\xff\u0100\uffff",
530 u"\x00\xff\u0100\uffff",
531 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000532 ]
533 )
534
Georg Brandle9741f32009-09-17 11:28:09 +0000535 def test_handlers(self):
536 self.assertEqual((u'\ufffd', 1),
537 codecs.utf_16_decode('\x01', 'replace', True))
538 self.assertEqual((u'', 1),
539 codecs.utf_16_decode('\x01', 'ignore', True))
540
Walter Dörwalde22d3392005-11-17 08:52:34 +0000541 def test_errors(self):
542 self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
543
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000544 def test_bug691291(self):
545 # Files are always opened in binary mode, even if no binary mode was
546 # specified. This means that no automatic conversion of '\n' is done
547 # on reading and writing.
548 s1 = u'Hello\r\nworld\r\n'
549
550 s = s1.encode(self.encoding)
Victor Stinner6c603c42011-05-23 16:19:31 +0200551 self.addCleanup(test_support.unlink, test_support.TESTFN)
552 with open(test_support.TESTFN, 'wb') as fp:
553 fp.write(s)
554 with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
555 self.assertEqual(reader.read(), s1)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000556
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000557class UTF16LETest(ReadTest):
558 encoding = "utf-16-le"
Walter Dörwald69652032004-09-07 20:24:22 +0000559
560 def test_partial(self):
561 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200562 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000563 [
564 u"",
565 u"\x00",
566 u"\x00",
567 u"\x00\xff",
568 u"\x00\xff",
569 u"\x00\xff\u0100",
570 u"\x00\xff\u0100",
571 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200572 u"\x00\xff\u0100\uffff",
573 u"\x00\xff\u0100\uffff",
574 u"\x00\xff\u0100\uffff",
575 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000576 ]
577 )
578
Walter Dörwalde22d3392005-11-17 08:52:34 +0000579 def test_errors(self):
Antoine Pitrou715a63b2012-07-21 00:52:06 +0200580 tests = [
581 (b'\xff', u'\ufffd'),
582 (b'A\x00Z', u'A\ufffd'),
583 (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
584 (b'\x00\xd8', u'\ufffd'),
585 (b'\x00\xd8A', u'\ufffd'),
586 (b'\x00\xd8A\x00', u'\ufffdA'),
587 (b'\x00\xdcA\x00', u'\ufffdA'),
588 ]
589 for raw, expected in tests:
Serhiy Storchaka0451fb92015-10-04 13:52:40 +0300590 try:
591 with self.assertRaises(UnicodeDecodeError):
592 codecs.utf_16_le_decode(raw, 'strict', True)
593 self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
594 except:
595 print 'raw=%r' % raw
596 raise
Walter Dörwalde22d3392005-11-17 08:52:34 +0000597
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000598class UTF16BETest(ReadTest):
599 encoding = "utf-16-be"
Walter Dörwald69652032004-09-07 20:24:22 +0000600
601 def test_partial(self):
602 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200603 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000604 [
605 u"",
606 u"\x00",
607 u"\x00",
608 u"\x00\xff",
609 u"\x00\xff",
610 u"\x00\xff\u0100",
611 u"\x00\xff\u0100",
612 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200613 u"\x00\xff\u0100\uffff",
614 u"\x00\xff\u0100\uffff",
615 u"\x00\xff\u0100\uffff",
616 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000617 ]
618 )
619
Walter Dörwalde22d3392005-11-17 08:52:34 +0000620 def test_errors(self):
Antoine Pitrou715a63b2012-07-21 00:52:06 +0200621 tests = [
622 (b'\xff', u'\ufffd'),
623 (b'\x00A\xff', u'A\ufffd'),
624 (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
625 (b'\xd8\x00', u'\ufffd'),
626 (b'\xd8\x00\xdc', u'\ufffd'),
627 (b'\xd8\x00\x00A', u'\ufffdA'),
628 (b'\xdc\x00\x00A', u'\ufffdA'),
629 ]
630 for raw, expected in tests:
Serhiy Storchaka0451fb92015-10-04 13:52:40 +0300631 try:
632 with self.assertRaises(UnicodeDecodeError):
633 codecs.utf_16_be_decode(raw, 'strict', True)
634 self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
635 except:
636 print 'raw=%r' % raw
637 raise
Walter Dörwalde22d3392005-11-17 08:52:34 +0000638
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000639class UTF8Test(ReadTest):
640 encoding = "utf-8"
Walter Dörwald69652032004-09-07 20:24:22 +0000641
642 def test_partial(self):
643 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200644 u"\x00\xff\u07ff\u0800\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000645 [
646 u"\x00",
647 u"\x00",
648 u"\x00\xff",
649 u"\x00\xff",
650 u"\x00\xff\u07ff",
651 u"\x00\xff\u07ff",
652 u"\x00\xff\u07ff",
653 u"\x00\xff\u07ff\u0800",
654 u"\x00\xff\u07ff\u0800",
655 u"\x00\xff\u07ff\u0800",
656 u"\x00\xff\u07ff\u0800\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200657 u"\x00\xff\u07ff\u0800\uffff",
658 u"\x00\xff\u07ff\u0800\uffff",
659 u"\x00\xff\u07ff\u0800\uffff",
660 u"\x00\xff\u07ff\u0800\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000661 ]
662 )
663
Walter Dörwalde22d3392005-11-17 08:52:34 +0000664class UTF7Test(ReadTest):
665 encoding = "utf-7"
666
Serhiy Storchakae12f6322015-10-02 13:14:53 +0300667 def test_ascii(self):
668 # Set D (directly encoded characters)
669 set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
670 'abcdefghijklmnopqrstuvwxyz'
671 '0123456789'
672 '\'(),-./:?')
673 self.assertEqual(set_d.encode(self.encoding), set_d)
674 self.assertEqual(set_d.decode(self.encoding), set_d)
675 # Set O (optional direct characters)
676 set_o = ' !"#$%&*;<=>@[]^_`{|}'
677 self.assertEqual(set_o.encode(self.encoding), set_o)
678 self.assertEqual(set_o.decode(self.encoding), set_o)
679 # +
680 self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b')
681 self.assertEqual('a+-b'.decode(self.encoding), u'a+b')
682 # White spaces
683 ws = ' \t\n\r'
684 self.assertEqual(ws.encode(self.encoding), ws)
685 self.assertEqual(ws.decode(self.encoding), ws)
686 # Other ASCII characters
687 other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) -
688 set(set_d + set_o + '+' + ws)))
689 self.assertEqual(other_ascii.encode(self.encoding),
690 '+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
691 'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
692
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +0000693 def test_partial(self):
694 self.check_partial(
695 u"a+-b",
696 [
697 u"a",
698 u"a",
699 u"a+",
700 u"a+-",
701 u"a+-b",
702 ]
703 )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000704
Serhiy Storchakaf1056722013-10-19 20:37:49 +0300705 def test_errors(self):
706 tests = [
Serhiy Storchaka462502b2015-10-10 09:33:11 +0300707 ('\xe1b', u'\ufffdb'),
708 ('a\xe1b', u'a\ufffdb'),
709 ('a\xe1\xe1b', u'a\ufffd\ufffdb'),
Serhiy Storchakaf1056722013-10-19 20:37:49 +0300710 ('a+IK', u'a\ufffd'),
711 ('a+IK-b', u'a\ufffdb'),
712 ('a+IK,b', u'a\ufffdb'),
713 ('a+IKx', u'a\u20ac\ufffd'),
714 ('a+IKx-b', u'a\u20ac\ufffdb'),
715 ('a+IKwgr', u'a\u20ac\ufffd'),
716 ('a+IKwgr-b', u'a\u20ac\ufffdb'),
717 ('a+IKwgr,', u'a\u20ac\ufffd'),
718 ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
719 ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
720 ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
721 ('a+/,+IKw-b', u'a\ufffd\u20acb'),
722 ('a+//,+IKw-b', u'a\ufffd\u20acb'),
723 ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
724 ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
Serhiy Storchaka462502b2015-10-10 09:33:11 +0300725 ('a+IKw-b\xe1', u'a\u20acb\ufffd'),
726 ('a+IKw\xe1b', u'a\u20ac\ufffdb'),
Serhiy Storchakaf1056722013-10-19 20:37:49 +0300727 ]
728 for raw, expected in tests:
Serhiy Storchaka0451fb92015-10-04 13:52:40 +0300729 try:
730 with self.assertRaises(UnicodeDecodeError):
731 codecs.utf_7_decode(raw, 'strict', True)
732 self.assertEqual(raw.decode('utf-7', 'replace'), expected)
733 except:
734 print 'raw=%r' % raw
735 raise
Serhiy Storchakaf1056722013-10-19 20:37:49 +0300736
737 def test_nonbmp(self):
738 self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
739 self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
740 self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
Serhiy Storchakae12f6322015-10-02 13:14:53 +0300741 self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0')
742 self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-')
743 self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0')
744 self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0')
745 self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding),
746 '+IKwgrNgB3KA-')
747 self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding),
748 u'\u20ac\u20ac\U000104A0')
749 self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding),
750 u'\u20ac\u20ac\U000104A0')
751
752 def test_lone_surrogates(self):
753 tests = [
754 ('a+2AE-b', u'a\ud801b'),
Serhiy Storchaka462502b2015-10-10 09:33:11 +0300755 ('a+2AE\xe1b', u'a\ufffdb'),
Serhiy Storchakae12f6322015-10-02 13:14:53 +0300756 ('a+2AE', u'a\ufffd'),
757 ('a+2AEA-b', u'a\ufffdb'),
758 ('a+2AH-b', u'a\ufffdb'),
759 ('a+IKzYAQ-b', u'a\u20ac\ud801b'),
Serhiy Storchaka462502b2015-10-10 09:33:11 +0300760 ('a+IKzYAQ\xe1b', u'a\u20ac\ufffdb'),
Serhiy Storchakae12f6322015-10-02 13:14:53 +0300761 ('a+IKzYAQA-b', u'a\u20ac\ufffdb'),
762 ('a+IKzYAd-b', u'a\u20ac\ufffdb'),
763 ('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'),
Serhiy Storchaka462502b2015-10-10 09:33:11 +0300764 ('a+IKwgrNgB\xe1b', u'a\u20ac\u20ac\ufffdb'),
Serhiy Storchakae12f6322015-10-02 13:14:53 +0300765 ('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'),
766 ('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'),
767 ]
768 for raw, expected in tests:
Serhiy Storchaka0451fb92015-10-04 13:52:40 +0300769 try:
770 self.assertEqual(raw.decode('utf-7', 'replace'), expected)
771 except:
772 print 'raw=%r' % raw
773 raise
Serhiy Storchakaf1056722013-10-19 20:37:49 +0300774
Walter Dörwalde22d3392005-11-17 08:52:34 +0000775class UTF16ExTest(unittest.TestCase):
776
777 def test_errors(self):
778 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
779
780 def test_bad_args(self):
781 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
782
783class ReadBufferTest(unittest.TestCase):
784
785 def test_array(self):
786 import array
787 self.assertEqual(
788 codecs.readbuffer_encode(array.array("c", "spam")),
789 ("spam", 4)
790 )
791
792 def test_empty(self):
793 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
794
795 def test_bad_args(self):
796 self.assertRaises(TypeError, codecs.readbuffer_encode)
797 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
798
799class CharBufferTest(unittest.TestCase):
800
801 def test_string(self):
802 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
803
804 def test_empty(self):
805 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
806
807 def test_bad_args(self):
808 self.assertRaises(TypeError, codecs.charbuffer_encode)
809 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
810
Martin v. Löwis412ed3b2006-01-08 10:45:39 +0000811class UTF8SigTest(ReadTest):
812 encoding = "utf-8-sig"
813
814 def test_partial(self):
815 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200816 u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
Martin v. Löwis412ed3b2006-01-08 10:45:39 +0000817 [
818 u"",
819 u"",
820 u"", # First BOM has been read and skipped
821 u"",
822 u"",
823 u"\ufeff", # Second BOM has been read and emitted
824 u"\ufeff\x00", # "\x00" read and emitted
825 u"\ufeff\x00", # First byte of encoded u"\xff" read
826 u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
827 u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
828 u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
829 u"\ufeff\x00\xff\u07ff",
830 u"\ufeff\x00\xff\u07ff",
831 u"\ufeff\x00\xff\u07ff\u0800",
832 u"\ufeff\x00\xff\u07ff\u0800",
833 u"\ufeff\x00\xff\u07ff\u0800",
834 u"\ufeff\x00\xff\u07ff\u0800\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200835 u"\ufeff\x00\xff\u07ff\u0800\uffff",
836 u"\ufeff\x00\xff\u07ff\u0800\uffff",
837 u"\ufeff\x00\xff\u07ff\u0800\uffff",
838 u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
Martin v. Löwis412ed3b2006-01-08 10:45:39 +0000839 ]
840 )
841
Walter Dörwald39b8b6a2006-11-23 05:03:56 +0000842 def test_bug1601501(self):
843 # SF bug #1601501: check that the codec works with a buffer
844 unicode("\xef\xbb\xbf", "utf-8-sig")
845
Walter Dörwald42348272007-04-12 10:35:00 +0000846 def test_bom(self):
847 d = codecs.getincrementaldecoder("utf-8-sig")()
848 s = u"spam"
849 self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
850
Walter Dörwald183744d2007-11-19 12:41:10 +0000851 def test_stream_bom(self):
852 unistring = u"ABC\u00A1\u2200XYZ"
853 bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
854
855 reader = codecs.getreader("utf-8-sig")
856 for sizehint in [None] + range(1, 11) + \
857 [64, 128, 256, 512, 1024]:
858 istream = reader(StringIO.StringIO(bytestring))
859 ostream = StringIO.StringIO()
860 while 1:
861 if sizehint is not None:
862 data = istream.read(sizehint)
863 else:
864 data = istream.read()
865
866 if not data:
867 break
868 ostream.write(data)
869
870 got = ostream.getvalue()
871 self.assertEqual(got, unistring)
872
873 def test_stream_bare(self):
874 unistring = u"ABC\u00A1\u2200XYZ"
875 bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
876
877 reader = codecs.getreader("utf-8-sig")
878 for sizehint in [None] + range(1, 11) + \
879 [64, 128, 256, 512, 1024]:
880 istream = reader(StringIO.StringIO(bytestring))
881 ostream = StringIO.StringIO()
882 while 1:
883 if sizehint is not None:
884 data = istream.read(sizehint)
885 else:
886 data = istream.read()
887
888 if not data:
889 break
890 ostream.write(data)
891
892 got = ostream.getvalue()
893 self.assertEqual(got, unistring)
894
Walter Dörwald8709a422002-09-03 13:53:40 +0000895class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000896 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000897 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000898
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200899 def test_raw(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200900 decode = codecs.escape_decode
901 for b in range(256):
902 b = chr(b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200903 if b != '\\':
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200904 self.assertEqual(decode(b + '0'), (b + '0', 2))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200905
906 def test_escape(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200907 decode = codecs.escape_decode
908 check = coding_checker(self, decode)
909 check(b"[\\\n]", b"[]")
910 check(br'[\"]', b'["]')
911 check(br"[\']", b"[']")
912 check(br"[\\]", br"[\]")
913 check(br"[\a]", b"[\x07]")
914 check(br"[\b]", b"[\x08]")
915 check(br"[\t]", b"[\x09]")
916 check(br"[\n]", b"[\x0a]")
917 check(br"[\v]", b"[\x0b]")
918 check(br"[\f]", b"[\x0c]")
919 check(br"[\r]", b"[\x0d]")
920 check(br"[\7]", b"[\x07]")
921 check(br"[\8]", br"[\8]")
922 check(br"[\78]", b"[\x078]")
923 check(br"[\41]", b"[!]")
924 check(br"[\418]", b"[!8]")
925 check(br"[\101]", b"[A]")
926 check(br"[\1010]", b"[A0]")
927 check(br"[\501]", b"[A]")
928 check(br"[\x41]", b"[A]")
929 check(br"[\X41]", br"[\X41]")
930 check(br"[\x410]", b"[A0]")
931 for b in range(256):
932 b = chr(b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200933 if b not in '\n"\'\\abtnvfr01234567x':
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200934 check('\\' + b, '\\' + b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200935
936 def test_errors(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200937 decode = codecs.escape_decode
938 self.assertRaises(ValueError, decode, br"\x")
939 self.assertRaises(ValueError, decode, br"[\x]")
940 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
941 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
942 self.assertRaises(ValueError, decode, br"\x0")
943 self.assertRaises(ValueError, decode, br"[\x0]")
944 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
945 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200946
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000947class RecodingTest(unittest.TestCase):
948 def test_recoding(self):
949 f = StringIO.StringIO()
950 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
951 f2.write(u"a")
952 f2.close()
953 # Python used to crash on this at exit because of a refcount
954 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000955
Martin v. Löwis2548c732003-04-18 10:39:54 +0000956# From RFC 3492
957punycode_testcases = [
958 # A Arabic (Egyptian):
959 (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
960 u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
961 "egbpdaj6bu4bxfgehfvwxn"),
962 # B Chinese (simplified):
963 (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
964 "ihqwcrb4cv8a8dqg056pqjye"),
965 # C Chinese (traditional):
966 (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
967 "ihqwctvzc91f659drss3x8bo0yb"),
968 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
969 (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
970 u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
971 u"\u0065\u0073\u006B\u0079",
972 "Proprostnemluvesky-uyb24dma41a"),
973 # E Hebrew:
974 (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
975 u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
976 u"\u05D1\u05E8\u05D9\u05EA",
977 "4dbcagdahymbxekheh6e0a7fei0b"),
978 # F Hindi (Devanagari):
979 (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
980 u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
981 u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
982 u"\u0939\u0948\u0902",
983 "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
984
985 #(G) Japanese (kanji and hiragana):
986 (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
987 u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
988 "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
989
990 # (H) Korean (Hangul syllables):
991 (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
992 u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
993 u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
994 "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
995 "psd879ccm6fea98c"),
996
997 # (I) Russian (Cyrillic):
998 (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
999 u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1000 u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1001 u"\u0438",
1002 "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
1003
1004 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
1005 (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1006 u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1007 u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1008 u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1009 u"\u0061\u00F1\u006F\u006C",
1010 "PorqunopuedensimplementehablarenEspaol-fmd56a"),
1011
1012 # (K) Vietnamese:
1013 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1014 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
1015 (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1016 u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1017 u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1018 u"\u0056\u0069\u1EC7\u0074",
1019 "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
1020
Martin v. Löwis2548c732003-04-18 10:39:54 +00001021 #(L) 3<nen>B<gumi><kinpachi><sensei>
1022 (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
1023 "3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001024
Martin v. Löwis2548c732003-04-18 10:39:54 +00001025 # (M) <amuro><namie>-with-SUPER-MONKEYS
1026 (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1027 u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1028 u"\u004F\u004E\u004B\u0045\u0059\u0053",
1029 "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
1030
1031 # (N) Hello-Another-Way-<sorezore><no><basho>
1032 (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1033 u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1034 u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
1035 "Hello-Another-Way--fc4qua05auwb3674vfr0b"),
1036
1037 # (O) <hitotsu><yane><no><shita>2
1038 (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
1039 "2-u9tlzr9756bt3uc0v"),
1040
1041 # (P) Maji<de>Koi<suru>5<byou><mae>
1042 (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1043 u"\u308B\u0035\u79D2\u524D",
1044 "MajiKoi5-783gue6qz075azm5e"),
1045
1046 # (Q) <pafii>de<runba>
1047 (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
1048 "de-jg4avhby1noc0d"),
1049
1050 # (R) <sono><supiido><de>
1051 (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
1052 "d9juau41awczczp"),
1053
1054 # (S) -> $1.00 <-
1055 (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1056 u"\u003C\u002D",
1057 "-> $1.00 <--")
1058 ]
1059
1060for i in punycode_testcases:
1061 if len(i)!=2:
1062 print repr(i)
1063
1064class PunycodeTest(unittest.TestCase):
1065 def test_encode(self):
1066 for uni, puny in punycode_testcases:
1067 # Need to convert both strings to lower case, since
1068 # some of the extended encodings use upper case, but our
1069 # code produces only lower case. Converting just puny to
1070 # lower is also insufficient, since some of the input characters
1071 # are upper case.
Ezio Melotti2623a372010-11-21 13:34:58 +00001072 self.assertEqual(uni.encode("punycode").lower(), puny.lower())
Martin v. Löwis2548c732003-04-18 10:39:54 +00001073
1074 def test_decode(self):
1075 for uni, puny in punycode_testcases:
Ezio Melotti2623a372010-11-21 13:34:58 +00001076 self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001077
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001078class UnicodeInternalTest(unittest.TestCase):
1079 def test_bug1251300(self):
1080 # Decoding with unicode_internal used to not correctly handle "code
1081 # points" above 0x10ffff on UCS-4 builds.
1082 if sys.maxunicode > 0xffff:
1083 ok = [
1084 ("\x00\x10\xff\xff", u"\U0010ffff"),
1085 ("\x00\x00\x01\x01", u"\U00000101"),
1086 ("", u""),
1087 ]
1088 not_ok = [
1089 "\x7f\xff\xff\xff",
1090 "\x80\x00\x00\x00",
1091 "\x81\x00\x00\x00",
1092 "\x00",
1093 "\x00\x00\x00\x00\x00",
1094 ]
1095 for internal, uni in ok:
1096 if sys.byteorder == "little":
1097 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +00001098 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001099 for internal in not_ok:
1100 if sys.byteorder == "little":
1101 internal = "".join(reversed(internal))
1102 self.assertRaises(UnicodeDecodeError, internal.decode,
1103 "unicode_internal")
1104
1105 def test_decode_error_attributes(self):
1106 if sys.maxunicode > 0xffff:
1107 try:
1108 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
1109 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +00001110 self.assertEqual("unicode_internal", ex.encoding)
1111 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1112 self.assertEqual(4, ex.start)
1113 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001114 else:
1115 self.fail()
1116
1117 def test_decode_callback(self):
1118 if sys.maxunicode > 0xffff:
1119 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
1120 decoder = codecs.getdecoder("unicode_internal")
1121 ab = u"ab".encode("unicode_internal")
1122 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
1123 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +00001124 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001125
Walter Dörwalda7fb4082009-05-06 14:28:24 +00001126 def test_encode_length(self):
1127 # Issue 3739
1128 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +00001129 self.assertEqual(encoder(u"a")[1], 1)
1130 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +00001131
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001132 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +00001133 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001134
Martin v. Löwis2548c732003-04-18 10:39:54 +00001135# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1136nameprep_tests = [
1137 # 3.1 Map to nothing.
1138 ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1139 '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1140 '\xb8\x8f\xef\xbb\xbf',
1141 'foobarbaz'),
1142 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
1143 ('CAFE',
1144 'cafe'),
1145 # 3.3 Case folding 8bit U+00DF (german sharp s).
1146 # The original test case is bogus; it says \xc3\xdf
1147 ('\xc3\x9f',
1148 'ss'),
1149 # 3.4 Case folding U+0130 (turkish capital I with dot).
1150 ('\xc4\xb0',
1151 'i\xcc\x87'),
1152 # 3.5 Case folding multibyte U+0143 U+037A.
1153 ('\xc5\x83\xcd\xba',
1154 '\xc5\x84 \xce\xb9'),
1155 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1156 # XXX: skip this as it fails in UCS-2 mode
1157 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1158 # 'telc\xe2\x88\x95kg\xcf\x83'),
1159 (None, None),
1160 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
1161 ('j\xcc\x8c\xc2\xa0\xc2\xaa',
1162 '\xc7\xb0 a'),
1163 # 3.8 Case folding U+1FB7 and normalization.
1164 ('\xe1\xbe\xb7',
1165 '\xe1\xbe\xb6\xce\xb9'),
1166 # 3.9 Self-reverting case folding U+01F0 and normalization.
1167 # The original test case is bogus, it says `\xc7\xf0'
1168 ('\xc7\xb0',
1169 '\xc7\xb0'),
1170 # 3.10 Self-reverting case folding U+0390 and normalization.
1171 ('\xce\x90',
1172 '\xce\x90'),
1173 # 3.11 Self-reverting case folding U+03B0 and normalization.
1174 ('\xce\xb0',
1175 '\xce\xb0'),
1176 # 3.12 Self-reverting case folding U+1E96 and normalization.
1177 ('\xe1\xba\x96',
1178 '\xe1\xba\x96'),
1179 # 3.13 Self-reverting case folding U+1F56 and normalization.
1180 ('\xe1\xbd\x96',
1181 '\xe1\xbd\x96'),
1182 # 3.14 ASCII space character U+0020.
1183 (' ',
1184 ' '),
1185 # 3.15 Non-ASCII 8bit space character U+00A0.
1186 ('\xc2\xa0',
1187 ' '),
1188 # 3.16 Non-ASCII multibyte space character U+1680.
1189 ('\xe1\x9a\x80',
1190 None),
1191 # 3.17 Non-ASCII multibyte space character U+2000.
1192 ('\xe2\x80\x80',
1193 ' '),
1194 # 3.18 Zero Width Space U+200b.
1195 ('\xe2\x80\x8b',
1196 ''),
1197 # 3.19 Non-ASCII multibyte space character U+3000.
1198 ('\xe3\x80\x80',
1199 ' '),
1200 # 3.20 ASCII control characters U+0010 U+007F.
1201 ('\x10\x7f',
1202 '\x10\x7f'),
1203 # 3.21 Non-ASCII 8bit control character U+0085.
1204 ('\xc2\x85',
1205 None),
1206 # 3.22 Non-ASCII multibyte control character U+180E.
1207 ('\xe1\xa0\x8e',
1208 None),
1209 # 3.23 Zero Width No-Break Space U+FEFF.
1210 ('\xef\xbb\xbf',
1211 ''),
1212 # 3.24 Non-ASCII control character U+1D175.
1213 ('\xf0\x9d\x85\xb5',
1214 None),
1215 # 3.25 Plane 0 private use character U+F123.
1216 ('\xef\x84\xa3',
1217 None),
1218 # 3.26 Plane 15 private use character U+F1234.
1219 ('\xf3\xb1\x88\xb4',
1220 None),
1221 # 3.27 Plane 16 private use character U+10F234.
1222 ('\xf4\x8f\x88\xb4',
1223 None),
1224 # 3.28 Non-character code point U+8FFFE.
1225 ('\xf2\x8f\xbf\xbe',
1226 None),
1227 # 3.29 Non-character code point U+10FFFF.
1228 ('\xf4\x8f\xbf\xbf',
1229 None),
1230 # 3.30 Surrogate code U+DF42.
1231 ('\xed\xbd\x82',
1232 None),
1233 # 3.31 Non-plain text character U+FFFD.
1234 ('\xef\xbf\xbd',
1235 None),
1236 # 3.32 Ideographic description character U+2FF5.
1237 ('\xe2\xbf\xb5',
1238 None),
1239 # 3.33 Display property character U+0341.
Tim Peters0eadaac2003-04-24 16:02:54 +00001240 ('\xcd\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001241 '\xcc\x81'),
1242 # 3.34 Left-to-right mark U+200E.
1243 ('\xe2\x80\x8e',
1244 None),
1245 # 3.35 Deprecated U+202A.
1246 ('\xe2\x80\xaa',
1247 None),
1248 # 3.36 Language tagging character U+E0001.
1249 ('\xf3\xa0\x80\x81',
1250 None),
1251 # 3.37 Language tagging character U+E0042.
1252 ('\xf3\xa0\x81\x82',
1253 None),
1254 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
1255 ('foo\xd6\xbebar',
1256 None),
1257 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
1258 ('foo\xef\xb5\x90bar',
1259 None),
1260 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
1261 ('foo\xef\xb9\xb6bar',
1262 'foo \xd9\x8ebar'),
1263 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
1264 ('\xd8\xa71',
1265 None),
1266 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
1267 ('\xd8\xa71\xd8\xa8',
1268 '\xd8\xa71\xd8\xa8'),
1269 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001270 # Skip this test as we allow unassigned
1271 #('\xf3\xa0\x80\x82',
1272 # None),
1273 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001274 # 3.44 Larger test (shrinking).
1275 # Original test case reads \xc3\xdf
1276 ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1277 '\xaa\xce\xb0\xe2\x80\x80',
1278 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
1279 # 3.45 Larger test (expanding).
1280 # Original test case reads \xc3\x9f
1281 ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1282 '\x80',
1283 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1284 '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1285 '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
1286 ]
1287
1288
1289class NameprepTest(unittest.TestCase):
1290 def test_nameprep(self):
1291 from encodings.idna import nameprep
1292 for pos, (orig, prepped) in enumerate(nameprep_tests):
1293 if orig is None:
1294 # Skipped
1295 continue
1296 # The Unicode strings are given in UTF-8
1297 orig = unicode(orig, "utf-8")
1298 if prepped is None:
1299 # Input contains prohibited characters
1300 self.assertRaises(UnicodeError, nameprep, orig)
1301 else:
1302 prepped = unicode(prepped, "utf-8")
1303 try:
Ezio Melotti2623a372010-11-21 13:34:58 +00001304 self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001305 except Exception,e:
1306 raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1307
Walter Dörwald78a0be62006-04-14 18:25:39 +00001308class IDNACodecTest(unittest.TestCase):
1309 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001310 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1311 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1312 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1313 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001314
1315 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001316 self.assertEqual(u"python.org".encode("idna"), "python.org")
1317 self.assertEqual("python.org.".encode("idna"), "python.org.")
1318 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1319 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001320
Martin v. Löwis8b595142005-08-25 11:03:38 +00001321 def test_stream(self):
1322 import StringIO
1323 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1324 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001325 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001326
Walter Dörwald78a0be62006-04-14 18:25:39 +00001327 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001328 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001329 "".join(codecs.iterdecode("python.org", "idna")),
1330 u"python.org"
1331 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001332 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001333 "".join(codecs.iterdecode("python.org.", "idna")),
1334 u"python.org."
1335 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001336 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001337 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1338 u"pyth\xf6n.org."
1339 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001340 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001341 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1342 u"pyth\xf6n.org."
1343 )
1344
1345 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001346 self.assertEqual(decoder.decode("xn--xam", ), u"")
1347 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1348 self.assertEqual(decoder.decode(u"rg"), u"")
1349 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001350
1351 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001352 self.assertEqual(decoder.decode("xn--xam", ), u"")
1353 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1354 self.assertEqual(decoder.decode("rg."), u"org.")
1355 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001356
1357 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001358 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001359 "".join(codecs.iterencode(u"python.org", "idna")),
1360 "python.org"
1361 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001362 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001363 "".join(codecs.iterencode(u"python.org.", "idna")),
1364 "python.org."
1365 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001366 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001367 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1368 "xn--pythn-mua.org."
1369 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001370 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001371 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1372 "xn--pythn-mua.org."
1373 )
1374
1375 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001376 self.assertEqual(encoder.encode(u"\xe4x"), "")
1377 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1378 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001379
1380 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001381 self.assertEqual(encoder.encode(u"\xe4x"), "")
1382 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1383 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001384
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001385class CodecsModuleTest(unittest.TestCase):
1386
1387 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001388 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001389 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001390 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001391 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001392 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1393
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001394 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001395 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001396 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001397 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001398 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001399 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001400 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
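        # Note (our reading): when no encoding is given, codecs.encode() and
        # codecs.decode() fall back to the default encoding, which is why the
        # plain 'abc' round trips above succeed.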
1401
1402 def test_register(self):
1403 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001404 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001405
1406 def test_lookup(self):
1407 self.assertRaises(TypeError, codecs.lookup)
1408 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001409 self.assertRaises(LookupError, codecs.lookup, " ")
1410
1411 def test_getencoder(self):
1412 self.assertRaises(TypeError, codecs.getencoder)
1413 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1414
1415 def test_getdecoder(self):
1416 self.assertRaises(TypeError, codecs.getdecoder)
1417 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1418
1419 def test_getreader(self):
1420 self.assertRaises(TypeError, codecs.getreader)
1421 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1422
1423 def test_getwriter(self):
1424 self.assertRaises(TypeError, codecs.getwriter)
1425 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001426
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001427 def test_lookup_issue1813(self):
1428 # Issue #1813: under Turkish locales, lookup of some codecs failed
1429 # because 'I' is lowercased as a dotless "i"
1430 oldlocale = locale.getlocale(locale.LC_CTYPE)
1431 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1432 try:
1433 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1434 except locale.Error:
1435 # Unsupported locale on this system
1436 self.skipTest('test needs Turkish locale')
1437 c = codecs.lookup('ASCII')
1438 self.assertEqual(c.name, 'ascii')
1439
Serhiy Storchaka74a651b2014-12-20 17:42:24 +02001440 def test_all(self):
1441 api = (
1442 "encode", "decode",
1443 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1444 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1445 "getencoder", "getdecoder", "getincrementalencoder",
1446 "getincrementaldecoder", "getreader", "getwriter",
1447 "register_error", "lookup_error",
1448 "strict_errors", "replace_errors", "ignore_errors",
1449 "xmlcharrefreplace_errors", "backslashreplace_errors",
1450 "open", "EncodedFile",
1451 "iterencode", "iterdecode",
1452 "BOM", "BOM_BE", "BOM_LE",
1453 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1454 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1455 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1456 "StreamReaderWriter", "StreamRecoder",
1457 )
1458 self.assertEqual(sorted(api), sorted(codecs.__all__))
1459 for api in codecs.__all__:
1460 getattr(codecs, api)
1461
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001462class StreamReaderTest(unittest.TestCase):
1463
1464 def setUp(self):
1465 self.reader = codecs.getreader('utf-8')
1466 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1467
1468 def test_readlines(self):
1469 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001470 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001471
Georg Brandl8f99f812006-10-29 08:39:22 +00001472class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001473
Georg Brandl8f99f812006-10-29 08:39:22 +00001474 def test_basic(self):
1475 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001476 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001477 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001478
1479 f = StringIO.StringIO()
1480 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1481 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001482 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001483
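# A small sketch (ours) of the transcoding direction shown in EncodedFileTest:
# EncodedFile(file, data_encoding, file_encoding) decodes bytes coming from
# the underlying file with file_encoding and re-encodes them with
# data_encoding on read(), and performs the reverse conversion on write().
def _encodedfile_write_sketch():
    f = StringIO.StringIO()
    ef = codecs.EncodedFile(f, data_encoding='utf-8', file_encoding='latin-1')
    ef.write('\xc3\xbc')      # UTF-8 bytes for u'\xfc'
    return f.getvalue()       # expected to be the Latin-1 byte '\xfc'
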
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001484class Str2StrTest(unittest.TestCase):
1485
1486 def test_read(self):
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001487 sin = codecs.encode("\x80", "base64_codec")
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001488 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1489 sout = reader.read()
1490 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001491 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001492
1493 def test_readline(self):
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001494 sin = codecs.encode("\x80", "base64_codec")
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001495 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1496 sout = reader.readline()
1497 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001498 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001499
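# Sketch (ours) of the pattern used in Str2StrTest: a str-to-str codec such
# as base64_codec can be driven through the stream reader interface, and
# read()/readline() then return byte strings rather than unicode.
def _str2str_reader_sketch(data="\x80"):
    encoded = codecs.encode(data, "base64_codec")
    reader = codecs.getreader("base64_codec")(StringIO.StringIO(encoded))
    return reader.read()      # expected to equal `data` and be of type str
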
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001500all_unicode_encodings = [
1501 "ascii",
1502 "base64_codec",
1503 "big5",
1504 "big5hkscs",
1505 "charmap",
1506 "cp037",
1507 "cp1006",
1508 "cp1026",
1509 "cp1140",
1510 "cp1250",
1511 "cp1251",
1512 "cp1252",
1513 "cp1253",
1514 "cp1254",
1515 "cp1255",
1516 "cp1256",
1517 "cp1257",
1518 "cp1258",
1519 "cp424",
1520 "cp437",
1521 "cp500",
Georg Brandlf0757a22010-05-24 21:29:07 +00001522 "cp720",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001523 "cp737",
1524 "cp775",
1525 "cp850",
1526 "cp852",
1527 "cp855",
1528 "cp856",
1529 "cp857",
Georg Brandlf0757a22010-05-24 21:29:07 +00001530 "cp858",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001531 "cp860",
1532 "cp861",
1533 "cp862",
1534 "cp863",
1535 "cp864",
1536 "cp865",
1537 "cp866",
1538 "cp869",
1539 "cp874",
1540 "cp875",
1541 "cp932",
1542 "cp949",
1543 "cp950",
1544 "euc_jis_2004",
1545 "euc_jisx0213",
1546 "euc_jp",
1547 "euc_kr",
1548 "gb18030",
1549 "gb2312",
1550 "gbk",
1551 "hex_codec",
1552 "hp_roman8",
1553 "hz",
1554 "idna",
1555 "iso2022_jp",
1556 "iso2022_jp_1",
1557 "iso2022_jp_2",
1558 "iso2022_jp_2004",
1559 "iso2022_jp_3",
1560 "iso2022_jp_ext",
1561 "iso2022_kr",
1562 "iso8859_1",
1563 "iso8859_10",
1564 "iso8859_11",
1565 "iso8859_13",
1566 "iso8859_14",
1567 "iso8859_15",
1568 "iso8859_16",
1569 "iso8859_2",
1570 "iso8859_3",
1571 "iso8859_4",
1572 "iso8859_5",
1573 "iso8859_6",
1574 "iso8859_7",
1575 "iso8859_8",
1576 "iso8859_9",
1577 "johab",
1578 "koi8_r",
1579 "koi8_u",
1580 "latin_1",
1581 "mac_cyrillic",
1582 "mac_greek",
1583 "mac_iceland",
1584 "mac_latin2",
1585 "mac_roman",
1586 "mac_turkish",
1587 "palmos",
1588 "ptcp154",
1589 "punycode",
1590 "raw_unicode_escape",
1591 "rot_13",
1592 "shift_jis",
1593 "shift_jis_2004",
1594 "shift_jisx0213",
1595 "tis_620",
1596 "unicode_escape",
1597 "unicode_internal",
1598 "utf_16",
1599 "utf_16_be",
1600 "utf_16_le",
1601 "utf_7",
1602 "utf_8",
1603]
1604
1605if hasattr(codecs, "mbcs_encode"):
1606 all_unicode_encodings.append("mbcs")
1607
1608# The following encodings work only with str, not unicode
1609all_string_encodings = [
1610 "quopri_codec",
1611 "string_escape",
1612 "uu_codec",
1613]
1614
1615# The following encoding is not tested, because it's not supposed
1616# to work:
1617# "undefined"
1618
1619# The following encodings don't work in stateful mode
1620broken_unicode_with_streams = [
1621 "base64_codec",
1622 "hex_codec",
1623 "punycode",
1624 "unicode_internal"
1625]
Georg Brandl2c9838e2006-10-29 14:39:09 +00001626broken_incremental_coders = broken_unicode_with_streams[:]
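# Rough illustration (ours, not executed) of why codecs such as base64_codec
# appear in the lists above: they are not restartable, so converting the data
# piecewise does not concatenate to the same result as converting it in one
# go, e.g.
#   codecs.encode("abc", "base64_codec")                      -> 'YWJj\n'
#   "".join(codecs.encode(c, "base64_codec") for c in "abc")  -> 'YQ==\nYg==\nYw==\n'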
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001627
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001628if sys.flags.py3k_warning:
1629 broken_unicode_with_streams.append("rot_13")
1630
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001631# The following encodings only support "strict" mode
1632only_strict_mode = [
1633 "idna",
1634 "zlib_codec",
Neal Norwitz1ead6982006-10-29 23:58:36 +00001635 "bz2_codec",
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001636]
1637
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001638try:
1639 import bz2
1640except ImportError:
1641 pass
1642else:
1643 all_unicode_encodings.append("bz2_codec")
1644 broken_unicode_with_streams.append("bz2_codec")
1645
1646try:
1647 import zlib
1648except ImportError:
1649 pass
1650else:
1651 all_unicode_encodings.append("zlib_codec")
1652 broken_unicode_with_streams.append("zlib_codec")
1653
1654class BasicUnicodeTest(unittest.TestCase):
1655 def test_basics(self):
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001656 s = u"abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001657 for encoding in all_unicode_encodings:
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001658 name = codecs.lookup(encoding).name
1659 if encoding.endswith("_codec"):
1660 name += "_codec"
1661 elif encoding == "latin_1":
1662 name = "latin_1"
1663 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001664 (bytes, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001665 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001666 (chars, size) = codecs.getdecoder(encoding)(bytes)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001667 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001668
1669 if encoding not in broken_unicode_with_streams:
1670 # check stream reader/writer
1671 q = Queue()
1672 writer = codecs.getwriter(encoding)(q)
1673 encodedresult = ""
1674 for c in s:
1675 writer.write(c)
1676 encodedresult += q.read()
1677 q = Queue()
1678 reader = codecs.getreader(encoding)(q)
1679 decodedresult = u""
1680 for c in encodedresult:
1681 q.write(c)
1682 decodedresult += reader.read()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001683 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001684
Georg Brandl2c9838e2006-10-29 14:39:09 +00001685 if encoding not in broken_incremental_coders:
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001686 # check incremental decoder/encoder and iterencode()/iterdecode()
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001687 try:
1688 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001689 except LookupError: # no IncrementalEncoder
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001690 pass
1691 else:
1692 # check incremental decoder/encoder
1693 encodedresult = ""
1694 for c in s:
1695 encodedresult += encoder.encode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001696 encodedresult += encoder.encode(u"", True)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001697 decoder = codecs.getincrementaldecoder(encoding)()
1698 decodedresult = u""
1699 for c in encodedresult:
1700 decodedresult += decoder.decode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001701 decodedresult += decoder.decode("", True)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001702 self.assertEqual(decodedresult, s,
1703 "encoding=%r" % encoding)
Walter Dörwald9ae019b2006-03-18 14:22:26 +00001704
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001705 # check iterencode()/iterdecode()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001706 result = u"".join(codecs.iterdecode(
1707 codecs.iterencode(s, encoding), encoding))
1708 self.assertEqual(result, s, "encoding=%r" % encoding)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001709
1710 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001711 result = u"".join(codecs.iterdecode(
1712 codecs.iterencode(u"", encoding), encoding))
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001713 self.assertEqual(result, u"")
1714
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001715 if encoding not in only_strict_mode:
1716 # check incremental decoder/encoder with errors argument
1717 try:
1718 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001719 except LookupError: # no IncrementalEncoder
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001720 pass
1721 else:
1722 encodedresult = "".join(encoder.encode(c) for c in s)
1723 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001724 decodedresult = u"".join(decoder.decode(c)
1725 for c in encodedresult)
1726 self.assertEqual(decodedresult, s,
1727 "encoding=%r" % encoding)
Tim Petersabd8a332006-11-03 02:32:46 +00001728
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001729 @test_support.cpython_only
1730 def test_basics_capi(self):
1731 from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
1732 s = u"abc123" # all codecs should be able to encode these
1733 for encoding in all_unicode_encodings:
1734 if encoding not in broken_incremental_coders:
1735 # check incremental decoder/encoder and iterencode()/iterdecode()
1736 try:
1737 cencoder = codec_incrementalencoder(encoding)
1738 except LookupError: # no IncrementalEncoder
1739 pass
1740 else:
1741 # check C API
1742 encodedresult = ""
1743 for c in s:
1744 encodedresult += cencoder.encode(c)
1745 encodedresult += cencoder.encode(u"", True)
1746 cdecoder = codec_incrementaldecoder(encoding)
1747 decodedresult = u""
1748 for c in encodedresult:
1749 decodedresult += cdecoder.decode(c)
1750 decodedresult += cdecoder.decode("", True)
1751 self.assertEqual(decodedresult, s,
1752 "encoding=%r" % encoding)
1753
1754 if encoding not in only_strict_mode:
1755 # check incremental decoder/encoder with errors argument
1756 try:
1757 cencoder = codec_incrementalencoder(encoding, "ignore")
1758 except LookupError: # no IncrementalEncoder
1759 pass
1760 else:
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001761 encodedresult = "".join(cencoder.encode(c) for c in s)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001762 cdecoder = codec_incrementaldecoder(encoding, "ignore")
1763 decodedresult = u"".join(cdecoder.decode(c)
1764 for c in encodedresult)
1765 self.assertEqual(decodedresult, s,
1766 "encoding=%r" % encoding)
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001767
Walter Dörwald729c31f2005-03-14 19:06:30 +00001768 def test_seek(self):
1769 # all codecs should be able to encode these
1770 s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
1771 for encoding in all_unicode_encodings:
1772 if encoding == "idna": # FIXME: See SF bug #1163178
1773 continue
1774 if encoding in broken_unicode_with_streams:
1775 continue
1776 reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
1777 for t in xrange(5):
1778 # Test that calling seek resets the internal codec state and buffers
1779 reader.seek(0, 0)
1780 line = reader.readline()
1781 self.assertEqual(s[:len(line)], line)
1782
Walter Dörwalde22d3392005-11-17 08:52:34 +00001783 def test_bad_decode_args(self):
1784 for encoding in all_unicode_encodings:
1785 decoder = codecs.getdecoder(encoding)
1786 self.assertRaises(TypeError, decoder)
1787 if encoding not in ("idna", "punycode"):
1788 self.assertRaises(TypeError, decoder, 42)
1789
1790 def test_bad_encode_args(self):
1791 for encoding in all_unicode_encodings:
1792 encoder = codecs.getencoder(encoding)
1793 self.assertRaises(TypeError, encoder)
1794
Neal Norwitz6d3d3392006-06-13 08:41:06 +00001795 def test_encoding_map_type_initialized(self):
1796 from encodings import cp1140
1797 # This used to crash, we are only verifying there's no crash.
1798 table_type = type(cp1140.encoding_table)
1799 self.assertEqual(table_type, table_type)
1800
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001801class BasicStrTest(unittest.TestCase):
1802 def test_basics(self):
1803 s = "abc123"
1804 for encoding in all_string_encodings:
1805 (bytes, size) = codecs.getencoder(encoding)(s)
1806 self.assertEqual(size, len(s))
1807 (chars, size) = codecs.getdecoder(encoding)(bytes)
1808 self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1809
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001810class CharmapTest(unittest.TestCase):
1811 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001812 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001813 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1814 (u"abc", 3)
1815 )
1816
Serhiy Storchaka95997452013-01-15 14:42:59 +02001817 self.assertRaises(UnicodeDecodeError,
1818 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1819 )
1820
1821 self.assertRaises(UnicodeDecodeError,
1822 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1823 )
1824
Ezio Melotti2623a372010-11-21 13:34:58 +00001825 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001826 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1827 (u"ab\ufffd", 3)
1828 )
1829
Ezio Melotti2623a372010-11-21 13:34:58 +00001830 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001831 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1832 (u"ab\ufffd", 3)
1833 )
1834
Ezio Melotti2623a372010-11-21 13:34:58 +00001835 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001836 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1837 (u"ab", 3)
1838 )
1839
Ezio Melotti2623a372010-11-21 13:34:58 +00001840 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001841 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1842 (u"ab", 3)
1843 )
1844
1845 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001846 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001847 codecs.charmap_decode(allbytes, "ignore", u""),
1848 (u"", len(allbytes))
1849 )
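        # To summarize what the assertions above exercise (our reading of the
        # API, not a normative statement): with a string map each input byte
        # indexes the map, and both an out-of-range byte and the sentinel
        # u"\ufffe" count as undefined, so "strict" raises, "replace" yields
        # u"\ufffd", and "ignore" drops the byte.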
1850
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001851 def test_decode_with_int2str_map(self):
1852 self.assertEqual(
1853 codecs.charmap_decode("\x00\x01\x02", "strict",
1854 {0: u'a', 1: u'b', 2: u'c'}),
1855 (u"abc", 3)
1856 )
1857
1858 self.assertEqual(
1859 codecs.charmap_decode("\x00\x01\x02", "strict",
1860 {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
1861 (u"AaBbCc", 3)
1862 )
1863
1864 self.assertEqual(
1865 codecs.charmap_decode("\x00\x01\x02", "strict",
1866 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
1867 (u"\U0010FFFFbc", 3)
1868 )
1869
1870 self.assertEqual(
1871 codecs.charmap_decode("\x00\x01\x02", "strict",
1872 {0: u'a', 1: u'b', 2: u''}),
1873 (u"ab", 3)
1874 )
1875
1876 self.assertRaises(UnicodeDecodeError,
1877 codecs.charmap_decode, "\x00\x01\x02", "strict",
1878 {0: u'a', 1: u'b'}
1879 )
1880
Serhiy Storchaka95997452013-01-15 14:42:59 +02001881 self.assertRaises(UnicodeDecodeError,
1882 codecs.charmap_decode, "\x00\x01\x02", "strict",
1883 {0: u'a', 1: u'b', 2: None}
1884 )
1885
1886 # Issue #14850
1887 self.assertRaises(UnicodeDecodeError,
1888 codecs.charmap_decode, "\x00\x01\x02", "strict",
1889 {0: u'a', 1: u'b', 2: u'\ufffe'}
1890 )
1891
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001892 self.assertEqual(
1893 codecs.charmap_decode("\x00\x01\x02", "replace",
1894 {0: u'a', 1: u'b'}),
1895 (u"ab\ufffd", 3)
1896 )
1897
1898 self.assertEqual(
1899 codecs.charmap_decode("\x00\x01\x02", "replace",
1900 {0: u'a', 1: u'b', 2: None}),
1901 (u"ab\ufffd", 3)
1902 )
1903
Serhiy Storchaka95997452013-01-15 14:42:59 +02001904 # Issue #14850
1905 self.assertEqual(
1906 codecs.charmap_decode("\x00\x01\x02", "replace",
1907 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1908 (u"ab\ufffd", 3)
1909 )
1910
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001911 self.assertEqual(
1912 codecs.charmap_decode("\x00\x01\x02", "ignore",
1913 {0: u'a', 1: u'b'}),
1914 (u"ab", 3)
1915 )
1916
1917 self.assertEqual(
1918 codecs.charmap_decode("\x00\x01\x02", "ignore",
1919 {0: u'a', 1: u'b', 2: None}),
1920 (u"ab", 3)
1921 )
1922
Serhiy Storchaka95997452013-01-15 14:42:59 +02001923 # Issue #14850
1924 self.assertEqual(
1925 codecs.charmap_decode("\x00\x01\x02", "ignore",
1926 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1927 (u"ab", 3)
1928 )
1929
1930 allbytes = "".join(chr(i) for i in xrange(256))
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001931 self.assertEqual(
1932 codecs.charmap_decode(allbytes, "ignore", {}),
1933 (u"", len(allbytes))
1934 )
1935
1936 def test_decode_with_int2int_map(self):
1937 a = ord(u'a')
1938 b = ord(u'b')
1939 c = ord(u'c')
1940
1941 self.assertEqual(
1942 codecs.charmap_decode("\x00\x01\x02", "strict",
1943 {0: a, 1: b, 2: c}),
1944 (u"abc", 3)
1945 )
1946
1947 # Issue #15379
1948 self.assertEqual(
1949 codecs.charmap_decode("\x00\x01\x02", "strict",
1950 {0: 0x10FFFF, 1: b, 2: c}),
1951 (u"\U0010FFFFbc", 3)
1952 )
1953
1954 self.assertRaises(TypeError,
1955 codecs.charmap_decode, "\x00\x01\x02", "strict",
1956 {0: 0x110000, 1: b, 2: c}
1957 )
1958
1959 self.assertRaises(UnicodeDecodeError,
1960 codecs.charmap_decode, "\x00\x01\x02", "strict",
1961 {0: a, 1: b},
1962 )
1963
Serhiy Storchaka95997452013-01-15 14:42:59 +02001964 self.assertRaises(UnicodeDecodeError,
1965 codecs.charmap_decode, "\x00\x01\x02", "strict",
1966 {0: a, 1: b, 2: 0xFFFE},
1967 )
1968
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001969 self.assertEqual(
1970 codecs.charmap_decode("\x00\x01\x02", "replace",
1971 {0: a, 1: b}),
1972 (u"ab\ufffd", 3)
1973 )
1974
1975 self.assertEqual(
Serhiy Storchaka95997452013-01-15 14:42:59 +02001976 codecs.charmap_decode("\x00\x01\x02", "replace",
1977 {0: a, 1: b, 2: 0xFFFE}),
1978 (u"ab\ufffd", 3)
1979 )
1980
1981 self.assertEqual(
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001982 codecs.charmap_decode("\x00\x01\x02", "ignore",
1983 {0: a, 1: b}),
1984 (u"ab", 3)
1985 )
1986
Serhiy Storchaka95997452013-01-15 14:42:59 +02001987 self.assertEqual(
1988 codecs.charmap_decode("\x00\x01\x02", "ignore",
1989 {0: a, 1: b, 2: 0xFFFE}),
1990 (u"ab", 3)
1991 )
1992
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001993
Georg Brandl8f99f812006-10-29 08:39:22 +00001994class WithStmtTest(unittest.TestCase):
1995 def test_encodedfile(self):
1996 f = StringIO.StringIO("\xc3\xbc")
1997 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001998 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001999
2000 def test_streamreaderwriter(self):
2001 f = StringIO.StringIO("\xc3\xbc")
2002 info = codecs.lookup("utf-8")
2003 with codecs.StreamReaderWriter(f, info.streamreader,
2004 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00002005 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00002006
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002007
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002008class UnicodeEscapeTest(unittest.TestCase):
2009 def test_empty(self):
2010 self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
2011 self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
2012
2013 def test_raw_encode(self):
2014 encode = codecs.unicode_escape_encode
2015 for b in range(32, 127):
2016 if b != ord('\\'):
2017 self.assertEqual(encode(unichr(b)), (chr(b), 1))
2018
2019 def test_raw_decode(self):
2020 decode = codecs.unicode_escape_decode
2021 for b in range(256):
2022 if b != ord('\\'):
2023 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
2024
2025 def test_escape_encode(self):
2026 encode = codecs.unicode_escape_encode
2027 check = coding_checker(self, encode)
2028 check(u'\t', r'\t')
2029 check(u'\n', r'\n')
2030 check(u'\r', r'\r')
2031 check(u'\\', r'\\')
2032 for b in range(32):
2033 if chr(b) not in '\t\n\r':
2034 check(unichr(b), '\\x%02x' % b)
2035 for b in range(127, 256):
2036 check(unichr(b), '\\x%02x' % b)
2037 check(u'\u20ac', r'\u20ac')
2038 check(u'\U0001d120', r'\U0001d120')
2039
2040 def test_escape_decode(self):
2041 decode = codecs.unicode_escape_decode
2042 check = coding_checker(self, decode)
2043 check("[\\\n]", u"[]")
2044 check(r'[\"]', u'["]')
2045 check(r"[\']", u"[']")
2046 check(r"[\\]", ur"[\]")
2047 check(r"[\a]", u"[\x07]")
2048 check(r"[\b]", u"[\x08]")
2049 check(r"[\t]", u"[\x09]")
2050 check(r"[\n]", u"[\x0a]")
2051 check(r"[\v]", u"[\x0b]")
2052 check(r"[\f]", u"[\x0c]")
2053 check(r"[\r]", u"[\x0d]")
2054 check(r"[\7]", u"[\x07]")
2055 check(r"[\8]", ur"[\8]")
2056 check(r"[\78]", u"[\x078]")
2057 check(r"[\41]", u"[!]")
2058 check(r"[\418]", u"[!8]")
2059 check(r"[\101]", u"[A]")
2060 check(r"[\1010]", u"[A0]")
2061 check(r"[\x41]", u"[A]")
2062 check(r"[\x410]", u"[A0]")
2063 check(r"\u20ac", u"\u20ac")
2064 check(r"\U0001d120", u"\U0001d120")
2065 for b in range(256):
2066 if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
2067 check('\\' + chr(b), u'\\' + unichr(b))
2068
2069 def test_decode_errors(self):
2070 decode = codecs.unicode_escape_decode
2071 for c, d in ('x', 2), ('u', 4), ('U', 4):
2072 for i in range(d):
2073 self.assertRaises(UnicodeDecodeError, decode,
2074 "\\" + c + "0"*i)
2075 self.assertRaises(UnicodeDecodeError, decode,
2076 "[\\" + c + "0"*i + "]")
2077 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
2078 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
2079 self.assertEqual(decode(data, "replace"),
2080 (u"[\ufffd]\ufffd", len(data)))
2081 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
2082 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
2083 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
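        # Our reading of the checks above: a truncated \x, \u or \U escape is
        # reported as a single error spanning the whole escape, which is why
        # "ignore" drops it completely and "replace" substitutes exactly one
        # u"\ufffd" for it.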
2084
2085
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02002086class RawUnicodeEscapeTest(unittest.TestCase):
2087 def test_empty(self):
2088 self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
2089 self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))
2090
2091 def test_raw_encode(self):
2092 encode = codecs.raw_unicode_escape_encode
2093 for b in range(256):
2094 self.assertEqual(encode(unichr(b)), (chr(b), 1))
2095
2096 def test_raw_decode(self):
2097 decode = codecs.raw_unicode_escape_decode
2098 for b in range(256):
2099 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
2100
2101 def test_escape_encode(self):
2102 encode = codecs.raw_unicode_escape_encode
2103 check = coding_checker(self, encode)
2104 for b in range(256):
2105 if chr(b) not in 'uU':
2106 check(u'\\' + unichr(b), '\\' + chr(b))
2107 check(u'\u20ac', r'\u20ac')
2108 check(u'\U0001d120', r'\U0001d120')
2109
2110 def test_escape_decode(self):
2111 decode = codecs.raw_unicode_escape_decode
2112 check = coding_checker(self, decode)
2113 for b in range(256):
2114 if chr(b) not in 'uU':
2115 check('\\' + chr(b), u'\\' + unichr(b))
2116 check(r"\u20ac", u"\u20ac")
2117 check(r"\U0001d120", u"\U0001d120")
2118
2119 def test_decode_errors(self):
2120 decode = codecs.raw_unicode_escape_decode
2121 for c, d in ('u', 4), ('U', 4):
2122 for i in range(d):
2123 self.assertRaises(UnicodeDecodeError, decode,
2124 "\\" + c + "0"*i)
2125 self.assertRaises(UnicodeDecodeError, decode,
2126 "[\\" + c + "0"*i + "]")
2127 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
2128 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
2129 self.assertEqual(decode(data, "replace"),
2130 (u"[\ufffd]\ufffd", len(data)))
2131 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
2132 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
2133 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
2134
2135
Victor Stinner262be5e2010-05-22 02:11:07 +00002136class BomTest(unittest.TestCase):
2137 def test_seek0(self):
Victor Stinner7df55da2010-05-22 13:37:56 +00002138 data = u"1234567890"
Victor Stinner262be5e2010-05-22 02:11:07 +00002139 tests = ("utf-16",
2140 "utf-16-le",
2141 "utf-16-be",
2142 "utf-32",
2143 "utf-32-le",
2144 "utf-32-be")
Victor Stinner6c603c42011-05-23 16:19:31 +02002145 self.addCleanup(test_support.unlink, test_support.TESTFN)
Victor Stinner262be5e2010-05-22 02:11:07 +00002146 for encoding in tests:
Victor Stinner7df55da2010-05-22 13:37:56 +00002147            # Check that the BOM is written only once
2148 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
Victor Stinner262be5e2010-05-22 02:11:07 +00002149 f.write(data)
2150 f.write(data)
2151 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002152 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002153 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002154 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002155
Victor Stinner7df55da2010-05-22 13:37:56 +00002156 # Check that the BOM is written after a seek(0)
2157 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2158 f.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002159 self.assertNotEqual(f.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002160 f.seek(0)
2161 f.write(data)
2162 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002163 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002164
2165 # (StreamWriter) Check that the BOM is written after a seek(0)
2166 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2167 f.writer.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002168 self.assertNotEqual(f.writer.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002169 f.writer.seek(0)
2170 f.writer.write(data)
2171 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002172 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002173
2174 # Check that the BOM is not written after a seek() at a position
2175 # different than the start
2176 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2177 f.write(data)
2178 f.seek(f.tell())
2179 f.write(data)
2180 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002181 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002182
2183 # (StreamWriter) Check that the BOM is not written after a seek()
2184 # at a position different than the start
2185 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2186 f.writer.write(data)
2187 f.writer.seek(f.writer.tell())
2188 f.writer.write(data)
2189 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002190 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002191
Victor Stinner262be5e2010-05-22 02:11:07 +00002192
Martin Panter90bc71f2015-09-12 02:20:06 +00002193class TransformCodecTest(unittest.TestCase):
2194
Martin Panterb2528c92015-09-12 00:34:28 +00002195 def test_quopri_stateless(self):
2196 # Should encode with quotetabs=True
2197 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2198 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2199 # But should still support unescaped tabs and spaces
2200 unescaped = b"space tab eol\n"
2201 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2202
Martin Panter90bc71f2015-09-12 02:20:06 +00002203 def test_uu_invalid(self):
2204 # Missing "begin" line
2205 self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
2206
2207
Fred Drake2e2be372001-09-20 21:33:42 +00002208def test_main():
Walter Dörwald21d3a322003-05-01 17:45:56 +00002209 test_support.run_unittest(
Walter Dörwald6e390802007-08-17 16:41:28 +00002210 UTF32Test,
2211 UTF32LETest,
2212 UTF32BETest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002213 UTF16Test,
Walter Dörwald69652032004-09-07 20:24:22 +00002214 UTF16LETest,
2215 UTF16BETest,
2216 UTF8Test,
Martin v. Löwis412ed3b2006-01-08 10:45:39 +00002217 UTF8SigTest,
Walter Dörwalde22d3392005-11-17 08:52:34 +00002218 UTF7Test,
2219 UTF16ExTest,
2220 ReadBufferTest,
2221 CharBufferTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002222 EscapeDecodeTest,
2223 RecodingTest,
2224 PunycodeTest,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002225 UnicodeInternalTest,
Martin v. Löwisa1dde132004-03-24 16:48:24 +00002226 NameprepTest,
Walter Dörwald78a0be62006-04-14 18:25:39 +00002227 IDNACodecTest,
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00002228 CodecsModuleTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002229 StreamReaderTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002230 EncodedFileTest,
Walter Dörwaldc9878e12005-07-20 22:15:39 +00002231 Str2StrTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002232 BasicUnicodeTest,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002233 BasicStrTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002234 CharmapTest,
2235 WithStmtTest,
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002236 UnicodeEscapeTest,
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02002237 RawUnicodeEscapeTest,
Victor Stinner262be5e2010-05-22 02:11:07 +00002238 BomTest,
Martin Panter90bc71f2015-09-12 02:20:06 +00002239 TransformCodecTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002240 )
Fred Drake2e2be372001-09-20 21:33:42 +00002241
2242
2243if __name__ == "__main__":
2244 test_main()