from test import test_support
import unittest
import codecs
import locale
import sys, StringIO

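# Helper for codec tests: returns a check(input, expect) function that asserts
# coder(input) == (expect, len(input)), i.e. the whole input was consumed and
# produced the expected output.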
def coding_checker(self, coder):
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", u"\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = StringIO.StringIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF32Test(ReadTest):
    encoding = "utf-32"

    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            try:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_16_le_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
            except:
                print 'raw=%r' % raw
                raise

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            try:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_16_be_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
            except:
                print 'raw=%r' % raw
                raise

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d)
        self.assertEqual(set_d.decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o)
        self.assertEqual(set_o.decode(self.encoding), set_o)
        # +
        self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b')
        self.assertEqual('a+-b'.decode(self.encoding), u'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws)
        self.assertEqual(ws.decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         '+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

    def test_errors(self):
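        # Each case pairs a malformed UTF-7 byte string with the unicode
        # result expected from decoding it with the "replace" error handler.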
        tests = [
            ('\xe1b', u'\ufffdb'),
            ('a\xe1b', u'a\ufffdb'),
            ('a\xe1\xe1b', u'a\ufffd\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+IKw-b\xe1', u'a\u20acb\ufffd'),
            ('a+IKw\xe1b', u'a\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            try:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
            except:
                print 'raw=%r' % raw
                raise

    def test_nonbmp(self):
        self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
        self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0')
        self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-')
        self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0')
        self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0')
        self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         '+IKwgrNgB3KA-')
        self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding),
                         u'\u20ac\u20ac\U000104A0')
        self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding),
                         u'\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        tests = [
            ('a+2AE-b', u'a\ud801b'),
            ('a+2AE\xe1b', u'a\ufffdb'),
            ('a+2AE', u'a\ufffd'),
            ('a+2AEA-b', u'a\ufffdb'),
            ('a+2AH-b', u'a\ufffdb'),
            ('a+IKzYAQ-b', u'a\u20ac\ud801b'),
            ('a+IKzYAQ\xe1b', u'a\u20ac\ufffdb'),
            ('a+IKzYAQA-b', u'a\u20ac\ufffdb'),
            ('a+IKzYAd-b', u'a\u20ac\ufffdb'),
            ('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'),
            ('a+IKwgrNgB\xe1b', u'a\u20ac\u20ac\ufffdb'),
            ('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            try:
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
            except:
                print 'raw=%r' % raw
                raise

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        decode = codecs.escape_decode
        for b in range(256):
            b = chr(b)
            if b != '\\':
                self.assertEqual(decode(b + '0'), (b + '0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        for b in range(256):
            b = chr(b)
            if b not in '\n"\'\\abtnvfr01234567x':
                check('\\' + b, '\\' + b)

    def test_errors(self):
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

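# Sanity check: every entry above must be a (unicode, punycode) pair.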
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")

Marc-André Lemburg3f419742004-07-10 12:06:10 +00001371class CodecsModuleTest(unittest.TestCase):
1372
1373 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001374 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001375 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001376 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001377 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001378 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1379
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001380 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001381 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001382 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001383 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001384 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001385 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001386 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1387
1388 def test_register(self):
1389 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001390 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001391
1392 def test_lookup(self):
1393 self.assertRaises(TypeError, codecs.lookup)
1394 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001395 self.assertRaises(LookupError, codecs.lookup, " ")
1396
1397 def test_getencoder(self):
1398 self.assertRaises(TypeError, codecs.getencoder)
1399 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1400
1401 def test_getdecoder(self):
1402 self.assertRaises(TypeError, codecs.getdecoder)
1403 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1404
1405 def test_getreader(self):
1406 self.assertRaises(TypeError, codecs.getreader)
1407 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1408
1409 def test_getwriter(self):
1410 self.assertRaises(TypeError, codecs.getwriter)
1411 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001412
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001413 def test_lookup_issue1813(self):
1414 # Issue #1813: under Turkish locales, lookup of some codecs failed
1415 # because 'I' is lowercased as a dotless "i"
1416 oldlocale = locale.getlocale(locale.LC_CTYPE)
1417 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1418 try:
1419 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1420 except locale.Error:
1421 # Unsupported locale on this system
1422 self.skipTest('test needs Turkish locale')
1423 c = codecs.lookup('ASCII')
1424 self.assertEqual(c.name, 'ascii')
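# With the issue fixed, the lookup succeeds and the codec name comes back
# normalized to lowercase regardless of the active locale.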
1425
Serhiy Storchaka74a651b2014-12-20 17:42:24 +02001426 def test_all(self):
1427 api = (
1428 "encode", "decode",
1429 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1430 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1431 "getencoder", "getdecoder", "getincrementalencoder",
1432 "getincrementaldecoder", "getreader", "getwriter",
1433 "register_error", "lookup_error",
1434 "strict_errors", "replace_errors", "ignore_errors",
1435 "xmlcharrefreplace_errors", "backslashreplace_errors",
1436 "open", "EncodedFile",
1437 "iterencode", "iterdecode",
1438 "BOM", "BOM_BE", "BOM_LE",
1439 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1440 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1441 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1442 "StreamReaderWriter", "StreamRecoder",
1443 )
1444 self.assertEqual(sorted(api), sorted(codecs.__all__))
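# Every name advertised in __all__ must also be an actual attribute
# of the codecs module.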
1445 for api in codecs.__all__:
1446 getattr(codecs, api)
1447
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001448class StreamReaderTest(unittest.TestCase):
1449
1450 def setUp(self):
1451 self.reader = codecs.getreader('utf-8')
1452 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
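# The byte string above is the UTF-8 encoding of u'\ud55c\n\uae00'.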
1453
1454 def test_readlines(self):
1455 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001456 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001457
Georg Brandl8f99f812006-10-29 08:39:22 +00001458class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001459
Georg Brandl8f99f812006-10-29 08:39:22 +00001460 def test_basic(self):
1461 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001462 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
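# Data read from an EncodedFile is decoded using the file encoding
# (utf-8 here) and re-encoded using the data encoding (utf-16-le).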
Ezio Melotti2623a372010-11-21 13:34:58 +00001463 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001464
1465 f = StringIO.StringIO()
1466 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
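# Writes go the other way: the written data is decoded using the data
# encoding (utf-8) and stored using the file encoding (latin1).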
1467 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001468 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001469
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001470class Str2StrTest(unittest.TestCase):
1471
1472 def test_read(self):
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001473 sin = codecs.encode("\x80", "base64_codec")
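# base64_codec is a str-to-str codec, so reading through its StreamReader
# yields str (bytes), not unicode.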
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001474 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1475 sout = reader.read()
1476 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001477 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001478
1479 def test_readline(self):
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001480 sin = codecs.encode("\x80", "base64_codec")
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001481 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1482 sout = reader.readline()
1483 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001484 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001485
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001486all_unicode_encodings = [
1487 "ascii",
1488 "base64_codec",
1489 "big5",
1490 "big5hkscs",
1491 "charmap",
1492 "cp037",
1493 "cp1006",
1494 "cp1026",
1495 "cp1140",
1496 "cp1250",
1497 "cp1251",
1498 "cp1252",
1499 "cp1253",
1500 "cp1254",
1501 "cp1255",
1502 "cp1256",
1503 "cp1257",
1504 "cp1258",
1505 "cp424",
1506 "cp437",
1507 "cp500",
Georg Brandlf0757a22010-05-24 21:29:07 +00001508 "cp720",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001509 "cp737",
1510 "cp775",
1511 "cp850",
1512 "cp852",
1513 "cp855",
1514 "cp856",
1515 "cp857",
Georg Brandlf0757a22010-05-24 21:29:07 +00001516 "cp858",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001517 "cp860",
1518 "cp861",
1519 "cp862",
1520 "cp863",
1521 "cp864",
1522 "cp865",
1523 "cp866",
1524 "cp869",
1525 "cp874",
1526 "cp875",
1527 "cp932",
1528 "cp949",
1529 "cp950",
1530 "euc_jis_2004",
1531 "euc_jisx0213",
1532 "euc_jp",
1533 "euc_kr",
1534 "gb18030",
1535 "gb2312",
1536 "gbk",
1537 "hex_codec",
1538 "hp_roman8",
1539 "hz",
1540 "idna",
1541 "iso2022_jp",
1542 "iso2022_jp_1",
1543 "iso2022_jp_2",
1544 "iso2022_jp_2004",
1545 "iso2022_jp_3",
1546 "iso2022_jp_ext",
1547 "iso2022_kr",
1548 "iso8859_1",
1549 "iso8859_10",
1550 "iso8859_11",
1551 "iso8859_13",
1552 "iso8859_14",
1553 "iso8859_15",
1554 "iso8859_16",
1555 "iso8859_2",
1556 "iso8859_3",
1557 "iso8859_4",
1558 "iso8859_5",
1559 "iso8859_6",
1560 "iso8859_7",
1561 "iso8859_8",
1562 "iso8859_9",
1563 "johab",
1564 "koi8_r",
1565 "koi8_u",
1566 "latin_1",
1567 "mac_cyrillic",
1568 "mac_greek",
1569 "mac_iceland",
1570 "mac_latin2",
1571 "mac_roman",
1572 "mac_turkish",
1573 "palmos",
1574 "ptcp154",
1575 "punycode",
1576 "raw_unicode_escape",
1577 "rot_13",
1578 "shift_jis",
1579 "shift_jis_2004",
1580 "shift_jisx0213",
1581 "tis_620",
1582 "unicode_escape",
1583 "unicode_internal",
1584 "utf_16",
1585 "utf_16_be",
1586 "utf_16_le",
1587 "utf_7",
1588 "utf_8",
1589]
1590
1591if hasattr(codecs, "mbcs_encode"):
1592 all_unicode_encodings.append("mbcs")
1593
1594# The following encodings work only with str, not unicode
1595all_string_encodings = [
1596 "quopri_codec",
1597 "string_escape",
1598 "uu_codec",
1599]
1600
1601# The following encoding is not tested, because it's not supposed
1602# to work:
1603# "undefined"
1604
1605# The following encodings don't work in stateful mode
1606broken_unicode_with_streams = [
1607 "base64_codec",
1608 "hex_codec",
1609 "punycode",
1610 "unicode_internal"
1611]
Georg Brandl2c9838e2006-10-29 14:39:09 +00001612broken_incremental_coders = broken_unicode_with_streams[:]
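# "Don't work" here means these codecs cannot round-trip when fed one
# byte or character at a time, which is how the stream and incremental
# checks in BasicUnicodeTest below drive them.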
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001613
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001614if sys.flags.py3k_warning:
1615 broken_unicode_with_streams.append("rot_13")
1616
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001617# The following encodings only support "strict" mode
1618only_strict_mode = [
1619 "idna",
1620 "zlib_codec",
Neal Norwitz1ead6982006-10-29 23:58:36 +00001621 "bz2_codec",
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001622]
1623
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001624try:
1625 import bz2
1626except ImportError:
1627 pass
1628else:
1629 all_unicode_encodings.append("bz2_codec")
1630 broken_unicode_with_streams.append("bz2_codec")
1631
1632try:
1633 import zlib
1634except ImportError:
1635 pass
1636else:
1637 all_unicode_encodings.append("zlib_codec")
1638 broken_unicode_with_streams.append("zlib_codec")
1639
1640class BasicUnicodeTest(unittest.TestCase):
1641 def test_basics(self):
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001642 s = u"abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001643 for encoding in all_unicode_encodings:
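# The canonical name reported by lookup() may differ from the alias used
# here (the "*_codec" codecs drop their suffix and latin_1 is registered
# under its ISO name), so the adjustments below undo that before comparing.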
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001644 name = codecs.lookup(encoding).name
1645 if encoding.endswith("_codec"):
1646 name += "_codec"
1647 elif encoding == "latin_1":
1648 name = "latin_1"
1649 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001650 (bytes, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001651 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001652 (chars, size) = codecs.getdecoder(encoding)(bytes)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001653 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001654
1655 if encoding not in broken_unicode_with_streams:
1656 # check stream reader/writer
1657 q = Queue()
1658 writer = codecs.getwriter(encoding)(q)
1659 encodedresult = ""
1660 for c in s:
1661 writer.write(c)
1662 encodedresult += q.read()
1663 q = Queue()
1664 reader = codecs.getreader(encoding)(q)
1665 decodedresult = u""
1666 for c in encodedresult:
1667 q.write(c)
1668 decodedresult += reader.read()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001669 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001670
Georg Brandl2c9838e2006-10-29 14:39:09 +00001671 if encoding not in broken_incremental_coders:
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001672 # check incremental decoder/encoder and iterencode()/iterdecode()
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001673 try:
1674 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001675 except LookupError: # no IncrementalEncoder
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001676 pass
1677 else:
1678 # check incremental decoder/encoder
1679 encodedresult = ""
1680 for c in s:
1681 encodedresult += encoder.encode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001682 encodedresult += encoder.encode(u"", True)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001683 decoder = codecs.getincrementaldecoder(encoding)()
1684 decodedresult = u""
1685 for c in encodedresult:
1686 decodedresult += decoder.decode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001687 decodedresult += decoder.decode("", True)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001688 self.assertEqual(decodedresult, s,
1689 "encoding=%r" % encoding)
Walter Dörwald9ae019b2006-03-18 14:22:26 +00001690
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001691 # check iterencode()/iterdecode()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001692 result = u"".join(codecs.iterdecode(
1693 codecs.iterencode(s, encoding), encoding))
1694 self.assertEqual(result, s, "encoding=%r" % encoding)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001695
1696 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001697 result = u"".join(codecs.iterdecode(
1698 codecs.iterencode(u"", encoding), encoding))
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001699 self.assertEqual(result, u"")
1700
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001701 if encoding not in only_strict_mode:
1702 # check incremental decoder/encoder with errors argument
1703 try:
1704 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001705 except LookupError: # no IncrementalEncoder
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001706 pass
1707 else:
1708 encodedresult = "".join(encoder.encode(c) for c in s)
1709 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001710 decodedresult = u"".join(decoder.decode(c)
1711 for c in encodedresult)
1712 self.assertEqual(decodedresult, s,
1713 "encoding=%r" % encoding)
Tim Petersabd8a332006-11-03 02:32:46 +00001714
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001715 @test_support.cpython_only
1716 def test_basics_capi(self):
1717 from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
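# Mirrors the incremental-coder checks in test_basics above, but drives
# the codecs through the C-level wrappers exposed by _testcapi.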
1718 s = u"abc123" # all codecs should be able to encode these
1719 for encoding in all_unicode_encodings:
1720 if encoding not in broken_incremental_coders:
1721 # check incremental decoder/encoder and iterencode()/iterdecode()
1722 try:
1723 cencoder = codec_incrementalencoder(encoding)
1724 except LookupError: # no IncrementalEncoder
1725 pass
1726 else:
1727 # check C API
1728 encodedresult = ""
1729 for c in s:
1730 encodedresult += cencoder.encode(c)
1731 encodedresult += cencoder.encode(u"", True)
1732 cdecoder = codec_incrementaldecoder(encoding)
1733 decodedresult = u""
1734 for c in encodedresult:
1735 decodedresult += cdecoder.decode(c)
1736 decodedresult += cdecoder.decode("", True)
1737 self.assertEqual(decodedresult, s,
1738 "encoding=%r" % encoding)
1739
1740 if encoding not in only_strict_mode:
1741 # check incremental decoder/encoder with errors argument
1742 try:
1743 cencoder = codec_incrementalencoder(encoding, "ignore")
1744 except LookupError: # no IncrementalEncoder
1745 pass
1746 else:
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001747 encodedresult = "".join(cencoder.encode(c) for c in s)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001748 cdecoder = codec_incrementaldecoder(encoding, "ignore")
1749 decodedresult = u"".join(cdecoder.decode(c)
1750 for c in encodedresult)
1751 self.assertEqual(decodedresult, s,
1752 "encoding=%r" % encoding)
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001753
Walter Dörwald729c31f2005-03-14 19:06:30 +00001754 def test_seek(self):
1755 # all codecs should be able to encode these
1756 s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
1757 for encoding in all_unicode_encodings:
1758 if encoding == "idna": # FIXME: See SF bug #1163178
1759 continue
1760 if encoding in broken_unicode_with_streams:
1761 continue
1762 reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
1763 for t in xrange(5):
1764 # Test that calling seek resets the internal codec state and buffers
1765 reader.seek(0, 0)
1766 line = reader.readline()
1767 self.assertEqual(s[:len(line)], line)
1768
Walter Dörwalde22d3392005-11-17 08:52:34 +00001769 def test_bad_decode_args(self):
1770 for encoding in all_unicode_encodings:
1771 decoder = codecs.getdecoder(encoding)
1772 self.assertRaises(TypeError, decoder)
1773 if encoding not in ("idna", "punycode"):
1774 self.assertRaises(TypeError, decoder, 42)
1775
1776 def test_bad_encode_args(self):
1777 for encoding in all_unicode_encodings:
1778 encoder = codecs.getencoder(encoding)
1779 self.assertRaises(TypeError, encoder)
1780
Neal Norwitz6d3d3392006-06-13 08:41:06 +00001781 def test_encoding_map_type_initialized(self):
1782 from encodings import cp1140
1783 # This used to crash; we are only verifying that there's no crash.
1784 table_type = type(cp1140.encoding_table)
1785 self.assertEqual(table_type, table_type)
1786
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001787class BasicStrTest(unittest.TestCase):
1788 def test_basics(self):
1789 s = "abc123"
1790 for encoding in all_string_encodings:
1791 (bytes, size) = codecs.getencoder(encoding)(s)
1792 self.assertEqual(size, len(s))
1793 (chars, size) = codecs.getdecoder(encoding)(bytes)
1794 self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1795
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001796class CharmapTest(unittest.TestCase):
1797 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001798 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001799 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1800 (u"abc", 3)
1801 )
1802
Serhiy Storchaka95997452013-01-15 14:42:59 +02001803 self.assertRaises(UnicodeDecodeError,
1804 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1805 )
1806
1807 self.assertRaises(UnicodeDecodeError,
1808 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1809 )
1810
Ezio Melotti2623a372010-11-21 13:34:58 +00001811 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001812 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1813 (u"ab\ufffd", 3)
1814 )
1815
Ezio Melotti2623a372010-11-21 13:34:58 +00001816 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001817 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1818 (u"ab\ufffd", 3)
1819 )
1820
Ezio Melotti2623a372010-11-21 13:34:58 +00001821 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001822 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1823 (u"ab", 3)
1824 )
1825
Ezio Melotti2623a372010-11-21 13:34:58 +00001826 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001827 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1828 (u"ab", 3)
1829 )
1830
1831 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001832 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001833 codecs.charmap_decode(allbytes, "ignore", u""),
1834 (u"", len(allbytes))
1835 )
1836
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001837 def test_decode_with_int2str_map(self):
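# In a dict mapping, a missing key, a None value, and u'\ufffe' are all
# treated as "undefined": "strict" raises UnicodeDecodeError, "replace"
# substitutes u'\ufffd', and "ignore" drops the byte.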
1838 self.assertEqual(
1839 codecs.charmap_decode("\x00\x01\x02", "strict",
1840 {0: u'a', 1: u'b', 2: u'c'}),
1841 (u"abc", 3)
1842 )
1843
1844 self.assertEqual(
1845 codecs.charmap_decode("\x00\x01\x02", "strict",
1846 {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
1847 (u"AaBbCc", 3)
1848 )
1849
1850 self.assertEqual(
1851 codecs.charmap_decode("\x00\x01\x02", "strict",
1852 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
1853 (u"\U0010FFFFbc", 3)
1854 )
1855
1856 self.assertEqual(
1857 codecs.charmap_decode("\x00\x01\x02", "strict",
1858 {0: u'a', 1: u'b', 2: u''}),
1859 (u"ab", 3)
1860 )
1861
1862 self.assertRaises(UnicodeDecodeError,
1863 codecs.charmap_decode, "\x00\x01\x02", "strict",
1864 {0: u'a', 1: u'b'}
1865 )
1866
Serhiy Storchaka95997452013-01-15 14:42:59 +02001867 self.assertRaises(UnicodeDecodeError,
1868 codecs.charmap_decode, "\x00\x01\x02", "strict",
1869 {0: u'a', 1: u'b', 2: None}
1870 )
1871
1872 # Issue #14850
1873 self.assertRaises(UnicodeDecodeError,
1874 codecs.charmap_decode, "\x00\x01\x02", "strict",
1875 {0: u'a', 1: u'b', 2: u'\ufffe'}
1876 )
1877
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001878 self.assertEqual(
1879 codecs.charmap_decode("\x00\x01\x02", "replace",
1880 {0: u'a', 1: u'b'}),
1881 (u"ab\ufffd", 3)
1882 )
1883
1884 self.assertEqual(
1885 codecs.charmap_decode("\x00\x01\x02", "replace",
1886 {0: u'a', 1: u'b', 2: None}),
1887 (u"ab\ufffd", 3)
1888 )
1889
Serhiy Storchaka95997452013-01-15 14:42:59 +02001890 # Issue #14850
1891 self.assertEqual(
1892 codecs.charmap_decode("\x00\x01\x02", "replace",
1893 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1894 (u"ab\ufffd", 3)
1895 )
1896
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001897 self.assertEqual(
1898 codecs.charmap_decode("\x00\x01\x02", "ignore",
1899 {0: u'a', 1: u'b'}),
1900 (u"ab", 3)
1901 )
1902
1903 self.assertEqual(
1904 codecs.charmap_decode("\x00\x01\x02", "ignore",
1905 {0: u'a', 1: u'b', 2: None}),
1906 (u"ab", 3)
1907 )
1908
Serhiy Storchaka95997452013-01-15 14:42:59 +02001909 # Issue #14850
1910 self.assertEqual(
1911 codecs.charmap_decode("\x00\x01\x02", "ignore",
1912 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1913 (u"ab", 3)
1914 )
1915
1916 allbytes = "".join(chr(i) for i in xrange(256))
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001917 self.assertEqual(
1918 codecs.charmap_decode(allbytes, "ignore", {}),
1919 (u"", len(allbytes))
1920 )
1921
1922 def test_decode_with_int2int_map(self):
1923 a = ord(u'a')
1924 b = ord(u'b')
1925 c = ord(u'c')
1926
1927 self.assertEqual(
1928 codecs.charmap_decode("\x00\x01\x02", "strict",
1929 {0: a, 1: b, 2: c}),
1930 (u"abc", 3)
1931 )
1932
1933 # Issue #15379
1934 self.assertEqual(
1935 codecs.charmap_decode("\x00\x01\x02", "strict",
1936 {0: 0x10FFFF, 1: b, 2: c}),
1937 (u"\U0010FFFFbc", 3)
1938 )
1939
1940 self.assertRaises(TypeError,
1941 codecs.charmap_decode, "\x00\x01\x02", "strict",
1942 {0: 0x110000, 1: b, 2: c}
1943 )
1944
1945 self.assertRaises(UnicodeDecodeError,
1946 codecs.charmap_decode, "\x00\x01\x02", "strict",
1947 {0: a, 1: b},
1948 )
1949
Serhiy Storchaka95997452013-01-15 14:42:59 +02001950 self.assertRaises(UnicodeDecodeError,
1951 codecs.charmap_decode, "\x00\x01\x02", "strict",
1952 {0: a, 1: b, 2: 0xFFFE},
1953 )
1954
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001955 self.assertEqual(
1956 codecs.charmap_decode("\x00\x01\x02", "replace",
1957 {0: a, 1: b}),
1958 (u"ab\ufffd", 3)
1959 )
1960
1961 self.assertEqual(
Serhiy Storchaka95997452013-01-15 14:42:59 +02001962 codecs.charmap_decode("\x00\x01\x02", "replace",
1963 {0: a, 1: b, 2: 0xFFFE}),
1964 (u"ab\ufffd", 3)
1965 )
1966
1967 self.assertEqual(
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001968 codecs.charmap_decode("\x00\x01\x02", "ignore",
1969 {0: a, 1: b}),
1970 (u"ab", 3)
1971 )
1972
Serhiy Storchaka95997452013-01-15 14:42:59 +02001973 self.assertEqual(
1974 codecs.charmap_decode("\x00\x01\x02", "ignore",
1975 {0: a, 1: b, 2: 0xFFFE}),
1976 (u"ab", 3)
1977 )
1978
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001979
Georg Brandl8f99f812006-10-29 08:39:22 +00001980class WithStmtTest(unittest.TestCase):
1981 def test_encodedfile(self):
1982 f = StringIO.StringIO("\xc3\xbc")
1983 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001984 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001985
1986 def test_streamreaderwriter(self):
1987 f = StringIO.StringIO("\xc3\xbc")
1988 info = codecs.lookup("utf-8")
1989 with codecs.StreamReaderWriter(f, info.streamreader,
1990 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001991 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001992
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001993
Serhiy Storchakac8e58122013-01-29 10:20:34 +02001994class UnicodeEscapeTest(unittest.TestCase):
1995 def test_empty(self):
1996 self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
1997 self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
1998
1999 def test_raw_encode(self):
2000 encode = codecs.unicode_escape_encode
2001 for b in range(32, 127):
2002 if b != ord('\\'):
2003 self.assertEqual(encode(unichr(b)), (chr(b), 1))
2004
2005 def test_raw_decode(self):
2006 decode = codecs.unicode_escape_decode
2007 for b in range(256):
2008 if b != ord('\\'):
2009 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
2010
2011 def test_escape_encode(self):
2012 encode = codecs.unicode_escape_encode
2013 check = coding_checker(self, encode)
2014 check(u'\t', r'\t')
2015 check(u'\n', r'\n')
2016 check(u'\r', r'\r')
2017 check(u'\\', r'\\')
2018 for b in range(32):
2019 if chr(b) not in '\t\n\r':
2020 check(unichr(b), '\\x%02x' % b)
2021 for b in range(127, 256):
2022 check(unichr(b), '\\x%02x' % b)
2023 check(u'\u20ac', r'\u20ac')
2024 check(u'\U0001d120', r'\U0001d120')
2025
2026 def test_escape_decode(self):
2027 decode = codecs.unicode_escape_decode
2028 check = coding_checker(self, decode)
2029 check("[\\\n]", u"[]")
2030 check(r'[\"]', u'["]')
2031 check(r"[\']", u"[']")
2032 check(r"[\\]", ur"[\]")
2033 check(r"[\a]", u"[\x07]")
2034 check(r"[\b]", u"[\x08]")
2035 check(r"[\t]", u"[\x09]")
2036 check(r"[\n]", u"[\x0a]")
2037 check(r"[\v]", u"[\x0b]")
2038 check(r"[\f]", u"[\x0c]")
2039 check(r"[\r]", u"[\x0d]")
2040 check(r"[\7]", u"[\x07]")
2041 check(r"[\8]", ur"[\8]")
2042 check(r"[\78]", u"[\x078]")
2043 check(r"[\41]", u"[!]")
2044 check(r"[\418]", u"[!8]")
2045 check(r"[\101]", u"[A]")
2046 check(r"[\1010]", u"[A0]")
2047 check(r"[\x41]", u"[A]")
2048 check(r"[\x410]", u"[A0]")
2049 check(r"\u20ac", u"\u20ac")
2050 check(r"\U0001d120", u"\U0001d120")
2051 for b in range(256):
2052 if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
2053 check('\\' + chr(b), u'\\' + unichr(b))
2054
2055 def test_decode_errors(self):
2056 decode = codecs.unicode_escape_decode
2057 for c, d in ('x', 2), ('u', 4), ('U', 4):
2058 for i in range(d):
2059 self.assertRaises(UnicodeDecodeError, decode,
2060 "\\" + c + "0"*i)
2061 self.assertRaises(UnicodeDecodeError, decode,
2062 "[\\" + c + "0"*i + "]")
2063 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
2064 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
2065 self.assertEqual(decode(data, "replace"),
2066 (u"[\ufffd]\ufffd", len(data)))
2067 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
2068 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
2069 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
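# In each case the error handler consumes the whole malformed escape:
# "ignore" drops it and "replace" substitutes a single u'\ufffd' for it,
# not one per character.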
2070
2071
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02002072class RawUnicodeEscapeTest(unittest.TestCase):
2073 def test_empty(self):
2074 self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
2075 self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))
2076
2077 def test_raw_encode(self):
2078 encode = codecs.raw_unicode_escape_encode
2079 for b in range(256):
2080 self.assertEqual(encode(unichr(b)), (chr(b), 1))
2081
2082 def test_raw_decode(self):
2083 decode = codecs.raw_unicode_escape_decode
2084 for b in range(256):
2085 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
2086
2087 def test_escape_encode(self):
2088 encode = codecs.raw_unicode_escape_encode
2089 check = coding_checker(self, encode)
2090 for b in range(256):
2091 if chr(b) not in 'uU':
2092 check(u'\\' + unichr(b), '\\' + chr(b))
2093 check(u'\u20ac', r'\u20ac')
2094 check(u'\U0001d120', r'\U0001d120')
2095
2096 def test_escape_decode(self):
2097 decode = codecs.raw_unicode_escape_decode
2098 check = coding_checker(self, decode)
2099 for b in range(256):
2100 if chr(b) not in 'uU':
2101 check('\\' + chr(b), u'\\' + unichr(b))
2102 check(r"\u20ac", u"\u20ac")
2103 check(r"\U0001d120", u"\U0001d120")
2104
2105 def test_decode_errors(self):
2106 decode = codecs.raw_unicode_escape_decode
2107 for c, d in ('u', 4), ('U', 4):
2108 for i in range(d):
2109 self.assertRaises(UnicodeDecodeError, decode,
2110 "\\" + c + "0"*i)
2111 self.assertRaises(UnicodeDecodeError, decode,
2112 "[\\" + c + "0"*i + "]")
2113 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
2114 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
2115 self.assertEqual(decode(data, "replace"),
2116 (u"[\ufffd]\ufffd", len(data)))
2117 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
2118 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
2119 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
2120
2121
Victor Stinner262be5e2010-05-22 02:11:07 +00002122class BomTest(unittest.TestCase):
2123 def test_seek0(self):
Victor Stinner7df55da2010-05-22 13:37:56 +00002124 data = u"1234567890"
Victor Stinner262be5e2010-05-22 02:11:07 +00002125 tests = ("utf-16",
2126 "utf-16-le",
2127 "utf-16-be",
2128 "utf-32",
2129 "utf-32-le",
2130 "utf-32-be")
Victor Stinner6c603c42011-05-23 16:19:31 +02002131 self.addCleanup(test_support.unlink, test_support.TESTFN)
Victor Stinner262be5e2010-05-22 02:11:07 +00002132 for encoding in tests:
Victor Stinner7df55da2010-05-22 13:37:56 +00002133 # Check that the BOM is written only once
2134 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
Victor Stinner262be5e2010-05-22 02:11:07 +00002135 f.write(data)
2136 f.write(data)
2137 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002138 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002139 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002140 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002141
Victor Stinner7df55da2010-05-22 13:37:56 +00002142 # Check that the BOM is written after a seek(0)
2143 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2144 f.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002145 self.assertNotEqual(f.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002146 f.seek(0)
2147 f.write(data)
2148 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002149 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002150
2151 # (StreamWriter) Check that the BOM is written after a seek(0)
2152 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2153 f.writer.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002154 self.assertNotEqual(f.writer.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002155 f.writer.seek(0)
2156 f.writer.write(data)
2157 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002158 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002159
2160 # Check that the BOM is not written after a seek() at a position
2161 # different from the start
2162 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2163 f.write(data)
2164 f.seek(f.tell())
2165 f.write(data)
2166 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002167 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002168
2169 # (StreamWriter) Check that the BOM is not written after a seek()
2170 # at a position different from the start
2171 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2172 f.writer.write(data)
2173 f.writer.seek(f.writer.tell())
2174 f.writer.write(data)
2175 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002176 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002177
Victor Stinner262be5e2010-05-22 02:11:07 +00002178
Martin Panter90bc71f2015-09-12 02:20:06 +00002179class TransformCodecTest(unittest.TestCase):
2180
Martin Panterb2528c92015-09-12 00:34:28 +00002181 def test_quopri_stateless(self):
2182 # Should encode with quotetabs=True
2183 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2184 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2185 # But should still support unescaped tabs and spaces
2186 unescaped = b"space tab eol\n"
2187 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2188
Martin Panter90bc71f2015-09-12 02:20:06 +00002189 def test_uu_invalid(self):
2190 # Missing "begin" line
2191 self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
2192
2193
Fred Drake2e2be372001-09-20 21:33:42 +00002194def test_main():
Walter Dörwald21d3a322003-05-01 17:45:56 +00002195 test_support.run_unittest(
Walter Dörwald6e390802007-08-17 16:41:28 +00002196 UTF32Test,
2197 UTF32LETest,
2198 UTF32BETest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002199 UTF16Test,
Walter Dörwald69652032004-09-07 20:24:22 +00002200 UTF16LETest,
2201 UTF16BETest,
2202 UTF8Test,
Martin v. Löwis412ed3b2006-01-08 10:45:39 +00002203 UTF8SigTest,
Walter Dörwalde22d3392005-11-17 08:52:34 +00002204 UTF7Test,
2205 UTF16ExTest,
2206 ReadBufferTest,
2207 CharBufferTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002208 EscapeDecodeTest,
2209 RecodingTest,
2210 PunycodeTest,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002211 UnicodeInternalTest,
Martin v. Löwisa1dde132004-03-24 16:48:24 +00002212 NameprepTest,
Walter Dörwald78a0be62006-04-14 18:25:39 +00002213 IDNACodecTest,
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00002214 CodecsModuleTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002215 StreamReaderTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002216 EncodedFileTest,
Walter Dörwaldc9878e12005-07-20 22:15:39 +00002217 Str2StrTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002218 BasicUnicodeTest,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002219 BasicStrTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002220 CharmapTest,
2221 WithStmtTest,
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002222 UnicodeEscapeTest,
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02002223 RawUnicodeEscapeTest,
Victor Stinner262be5e2010-05-22 02:11:07 +00002224 BomTest,
Martin Panter90bc71f2015-09-12 02:20:06 +00002225 TransformCodecTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002226 )
Fred Drake2e2be372001-09-20 21:33:42 +00002227
2228
2229if __name__ == "__main__":
2230 test_main()