blob: 9f3b17a7c57eb4bca91a16fd0a3a4306c6d8e1ab [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02005import sys, StringIO
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
def coding_checker(self, coder):
    """Build a ``check(input, expect)`` helper bound to a test case.

    The returned helper calls ``coder(input)`` and asserts, through
    ``self.assertEqual``, that the result is the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))
    return check
11
class Queue(object):
    """
    Minimal FIFO of bytes: write() appends at the tail, read() consumes
    from the head.
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # append the new data behind whatever is still pending
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        # A negative size drains the whole queue; otherwise at most
        # ``size`` leading items are handed out and the rest stays queued.
        pending = self._buffer
        if size < 0:
            head, tail = pending, ""
        else:
            head, tail = pending[:size], pending[size:]
        self._buffer = tail
        return head
31
# Shared reader tests for the codec test cases.  The methods read
# ``self.encoding``, which this class does not define itself; the
# subclasses in this file provide it as a class attribute.
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # input: unicode text that gets encoded with self.encoding.
        # partialresults: entry i is the text expected to have been decoded
        # once i+1 bytes of the encoded input were fed in.
        #
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        # NOTE: zip() stops at the shorter sequence, so a partialresults list
        # of the wrong length is not detected here.
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly: after reset() the
        # same decoder object must reproduce the same partial results
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode(): joining its chunks must give back the input
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        # Exercise StreamReader.readline() with every line end in
        # ``lineends``, with and without keepends and a size hint.
        def getreader(input):
            # StreamReader over the encoded form of input
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Call readline() until it returns an empty result and join the
            # lines with "|" so that the split points show up in one string.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", u"\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []   # lines with their line end
        vwo = []  # the same lines without line end
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                # ten repetitions of: <size times "a"><lineend>xxx\n
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                # same data again, this time with the line ends stripped
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Mixing readline()/readlines()/read() on one StreamReader must not
        # lose or duplicate data; each scenario starts from a fresh reader.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            # fresh StreamReader over the encoded form of data
            stream = StringIO.StringIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield the \r\n terminated lines
        # of this fixture unchanged and in order.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # NOTE: only the lines the reader actually yields are compared; a
        # reader that stops early is not detected by this loop.
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # readline() on a stream that is filled piece by piece: writer and
        # reader share one Queue, so data written becomes readable later.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends (keepends=False)
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        # the "\n" arriving after "foo" was already returned comes back as
        # a line of its own, which is empty once the line end is stripped
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends (keepends=True)
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        # same situation as above, but the lone "\n" is kept
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Three \r\n terminated lines must come back from readline()
        # unchanged, followed by u"" once the stream is exhausted.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
295
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # u"spamspam" behind exactly one BOM, little- and big-endian
    spamle = ('\xff\xfe\x00\x00' +
              2 * 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff' +
              2 * '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # Two write() calls on one StreamWriter must produce a single BOM.
        reader_cls, writer_cls = codecs.lookup(self.encoding)[2:4]
        raw = StringIO.StringIO()
        out = writer_cls(raw)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = raw.getvalue()
        # either byte order is fine, as long as there is just one BOM
        self.assertTrue(encoded == self.spamle or encoded == self.spambe)
        # reading the bytes back must give the text without any BOM
        back = reader_cls(StringIO.StringIO(encoded))
        self.assertEqual(back.read(), u"spamspam")

    def test_badbom(self):
        # Neither one nor two all-0xff code units are a valid BOM.
        for count in (4, 8):
            stream = StringIO.StringIO(count*"\xff")
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        expected = (
            # 4 BOM bytes (byteorder known after the fourth) plus the
            # first three bytes of u"\x00"
            [u""] * 7 +
            [u"\x00"] * 4 +
            [u"\x00\xff"] * 4 +
            [u"\x00\xff\u0100"] * 4 +
            [u"\x00\xff\u0100\uffff"] * 4 +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(u"\x00\xff\u0100\uffff\U00010000", expected)

    def test_handlers(self):
        # a truncated code unit is replaced or dropped, and consumed
        for errors, text in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((text, 1),
                             codecs.utf_32_decode('\x01', errors, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode("\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        wanted = u'\U00010000' * 1024
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        for encoded in (encoded_le, encoded_be):
            self.assertEqual(wanted, codecs.utf_32_decode(encoded)[0])
380
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        # no BOM: every character is complete after its fourth byte
        expected = (
            [u""] * 3 +
            [u"\x00"] * 4 +
            [u"\x00\xff"] * 4 +
            [u"\x00\xff\u0100"] * 4 +
            [u"\x00\xff\u0100\uffff"] * 4 +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(u"\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode("\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        raw = '\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(raw)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
424
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        # no BOM: every character is complete after its fourth byte
        expected = (
            [u""] * 3 +
            [u"\x00"] * 4 +
            [u"\x00\xff"] * 4 +
            [u"\x00\xff\u0100"] * 4 +
            [u"\x00\xff\u0100\uffff"] * 4 +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(u"\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode("\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        raw = '\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(raw)[0]
        self.assertEqual(u'\U00010000' * 1024, decoded)
468
469
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # u"spamspam" behind exactly one BOM, little- and big-endian
    spamle = '\xff\xfe' + 2 * 's\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff' + 2 * '\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two write() calls on one StreamWriter must produce a single BOM.
        reader_cls, writer_cls = codecs.lookup(self.encoding)[2:4]
        raw = StringIO.StringIO()
        out = writer_cls(raw)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = raw.getvalue()
        # either byte order is fine, as long as there is just one BOM
        self.assertTrue(encoded == self.spamle or encoded == self.spambe)
        # reading the bytes back must give the text without any BOM
        back = reader_cls(StringIO.StringIO(encoded))
        self.assertEqual(back.read(), u"spamspam")

    def test_badbom(self):
        # Neither one nor two all-0xff code units are a valid BOM.
        for count in (2, 4):
            stream = StringIO.StringIO(count * "\xff")
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        expected = (
            # 2 BOM bytes (byteorder known after the second) plus the
            # first byte of u"\x00"
            [u""] * 3 +
            [u"\x00"] * 2 +
            [u"\x00\xff"] * 2 +
            [u"\x00\xff\u0100"] * 2 +
            # u"\uffff" completed, then three bytes of the surrogate pair
            [u"\x00\xff\u0100\uffff"] * 4 +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(u"\x00\xff\u0100\uffff\U00010000", expected)

    def test_handlers(self):
        # a truncated code unit is replaced or dropped, and consumed
        for errors, text in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((text, 1),
                             codecs.utf_16_decode('\x01', errors, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode("\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = u'Hello\r\nworld\r\n'
        path = test_support.TESTFN

        encoded = text.encode(self.encoding)
        self.addCleanup(test_support.unlink, path)
        with open(path, 'wb') as fp:
            fp.write(encoded)
        with codecs.open(path, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000542
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # no BOM: BMP characters are complete after their second byte
        expected = (
            [u""] +
            [u"\x00"] * 2 +
            [u"\x00\xff"] * 2 +
            [u"\x00\xff\u0100"] * 2 +
            # u"\uffff" completed, then three bytes of the surrogate pair
            [u"\x00\xff\u0100\uffff"] * 4 +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(u"\x00\xff\u0100\uffff\U00010000", expected)

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16le', 'replace'),
                             replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000579
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # no BOM: BMP characters are complete after their second byte
        expected = (
            [u""] +
            [u"\x00"] * 2 +
            [u"\x00\xff"] * 2 +
            [u"\x00\xff\u0100"] * 2 +
            # u"\uffff" completed, then three bytes of the surrogate pair
            [u"\x00\xff\u0100\uffff"] * 4 +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(u"\x00\xff\u0100\uffff\U00010000", expected)

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16be', 'replace'),
                             replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000616
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # A character shows up in the result once its last byte was fed;
        # the input uses 1, 2, 2, 3, 3 and 4 byte sequences.
        expected = (
            [u"\x00"] * 2 +
            [u"\x00\xff"] * 2 +
            [u"\x00\xff\u07ff"] * 3 +
            [u"\x00\xff\u07ff\u0800"] * 3 +
            [u"\x00\xff\u07ff\u0800\uffff"] * 4 +
            [u"\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(u"\x00\xff\u07ff\u0800\uffff\U00010000", expected)
641
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        expected = [
            u"a",
            u"a",
            u"a+",
            u"a+-",
            u"a+-b",
        ]
        self.check_partial(u"a+-b", expected)

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = [
            ('a\xffb', u'a\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
        ]
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_7_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        # the astral character and its explicit surrogate pair encode alike
        encoded = '+2AHcoA-'
        for text in (u'\U000104A0', u'\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), u'\U000104A0')
685
Walter Dörwalde22d3392005-11-17 08:52:34 +0000686class UTF16ExTest(unittest.TestCase):
687
688 def test_errors(self):
689 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
690
691 def test_bad_args(self):
692 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
693
694class ReadBufferTest(unittest.TestCase):
695
696 def test_array(self):
697 import array
698 self.assertEqual(
699 codecs.readbuffer_encode(array.array("c", "spam")),
700 ("spam", 4)
701 )
702
703 def test_empty(self):
704 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
705
706 def test_bad_args(self):
707 self.assertRaises(TypeError, codecs.readbuffer_encode)
708 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
709
710class CharBufferTest(unittest.TestCase):
711
712 def test_string(self):
713 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
714
715 def test_empty(self):
716 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
717
718 def test_bad_args(self):
719 self.assertRaises(TypeError, codecs.charbuffer_encode)
720 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
721
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        # The encoded input starts with the signature written by the codec,
        # followed by the encoded u"\ufeff" that is part of the text itself.
        expected = (
            # three signature bytes (read and skipped) and the first two
            # bytes of the second BOM
            [u""] * 5 +
            # second BOM has been read and emitted
            [u"\ufeff"] +
            # "\x00" read and emitted, then first byte of encoded u"\xff"
            [u"\ufeff\x00"] * 2 +
            # second byte of u"\xff", then first byte of encoded u"\u07ff"
            [u"\ufeff\x00\xff"] * 2 +
            # second byte of u"\u07ff", then two bytes of u"\u0800"
            [u"\ufeff\x00\xff\u07ff"] * 3 +
            [u"\ufeff\x00\xff\u07ff\u0800"] * 3 +
            [u"\ufeff\x00\xff\u07ff\u0800\uffff"] * 4 +
            [u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            expected
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer.
        # Input consisting of nothing but the signature decodes to the
        # empty string; assert that instead of only checking for a crash.
        self.assertEqual(unicode("\xef\xbb\xbf", "utf-8-sig"), u"")

    def test_bom(self):
        # the incremental decoder strips the signature the encoder added
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Decode bytestring through a utf-8-sig StreamReader using a range
        # of read() size hints (None means "no hint") and check that every
        # run reassembles to unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # input that starts with the UTF-8 signature
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # the same text without signature must decode identically
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
805
Walter Dörwald8709a422002-09-03 13:53:40 +0000806class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000807 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000808 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000809
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200810 def test_raw(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200811 decode = codecs.escape_decode
812 for b in range(256):
813 b = chr(b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200814 if b != '\\':
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200815 self.assertEqual(decode(b + '0'), (b + '0', 2))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200816
817 def test_escape(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200818 decode = codecs.escape_decode
819 check = coding_checker(self, decode)
820 check(b"[\\\n]", b"[]")
821 check(br'[\"]', b'["]')
822 check(br"[\']", b"[']")
823 check(br"[\\]", br"[\]")
824 check(br"[\a]", b"[\x07]")
825 check(br"[\b]", b"[\x08]")
826 check(br"[\t]", b"[\x09]")
827 check(br"[\n]", b"[\x0a]")
828 check(br"[\v]", b"[\x0b]")
829 check(br"[\f]", b"[\x0c]")
830 check(br"[\r]", b"[\x0d]")
831 check(br"[\7]", b"[\x07]")
832 check(br"[\8]", br"[\8]")
833 check(br"[\78]", b"[\x078]")
834 check(br"[\41]", b"[!]")
835 check(br"[\418]", b"[!8]")
836 check(br"[\101]", b"[A]")
837 check(br"[\1010]", b"[A0]")
838 check(br"[\501]", b"[A]")
839 check(br"[\x41]", b"[A]")
840 check(br"[\X41]", br"[\X41]")
841 check(br"[\x410]", b"[A0]")
842 for b in range(256):
843 b = chr(b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200844 if b not in '\n"\'\\abtnvfr01234567x':
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200845 check('\\' + b, '\\' + b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200846
847 def test_errors(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200848 decode = codecs.escape_decode
849 self.assertRaises(ValueError, decode, br"\x")
850 self.assertRaises(ValueError, decode, br"[\x]")
851 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
852 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
853 self.assertRaises(ValueError, decode, br"\x0")
854 self.assertRaises(ValueError, decode, br"[\x0]")
855 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
856 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200857
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000858class RecodingTest(unittest.TestCase):
859 def test_recoding(self):
860 f = StringIO.StringIO()
861 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
862 f2.write(u"a")
863 f2.close()
864 # Python used to crash on this at exit because of a refcount
865 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000866
# From RFC 3492
#
# Each entry is a (unicode text, punycode byte string) pair.  PunycodeTest
# encodes the first item and decodes the second; the encode comparison is
# done case-insensitively because some expected strings use upper case.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
970
# Import-time sanity check: report any table entry that is not a
# two-item tuple (for example after a missing comma in the table).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
974
class PunycodeTest(unittest.TestCase):
    """Run the punycode_testcases table through the punycode codec."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case while our encoder only
            # produces lower case, and lowering just the expected value
            # is not enough because some inputs contain upper-case
            # characters as well.
            actual = text.encode("punycode")
            self.assertEqual(actual.lower(), expected.lower())

    def test_decode(self):
        for expected, encoded in punycode_testcases:
            decoded = encoded.decode("punycode")
            self.assertEqual(expected, decoded)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000988
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000989class UnicodeInternalTest(unittest.TestCase):
990 def test_bug1251300(self):
991 # Decoding with unicode_internal used to not correctly handle "code
992 # points" above 0x10ffff on UCS-4 builds.
993 if sys.maxunicode > 0xffff:
994 ok = [
995 ("\x00\x10\xff\xff", u"\U0010ffff"),
996 ("\x00\x00\x01\x01", u"\U00000101"),
997 ("", u""),
998 ]
999 not_ok = [
1000 "\x7f\xff\xff\xff",
1001 "\x80\x00\x00\x00",
1002 "\x81\x00\x00\x00",
1003 "\x00",
1004 "\x00\x00\x00\x00\x00",
1005 ]
1006 for internal, uni in ok:
1007 if sys.byteorder == "little":
1008 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +00001009 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001010 for internal in not_ok:
1011 if sys.byteorder == "little":
1012 internal = "".join(reversed(internal))
1013 self.assertRaises(UnicodeDecodeError, internal.decode,
1014 "unicode_internal")
1015
1016 def test_decode_error_attributes(self):
1017 if sys.maxunicode > 0xffff:
1018 try:
1019 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
1020 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +00001021 self.assertEqual("unicode_internal", ex.encoding)
1022 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1023 self.assertEqual(4, ex.start)
1024 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001025 else:
1026 self.fail()
1027
1028 def test_decode_callback(self):
1029 if sys.maxunicode > 0xffff:
1030 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
1031 decoder = codecs.getdecoder("unicode_internal")
1032 ab = u"ab".encode("unicode_internal")
1033 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
1034 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +00001035 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001036
Walter Dörwalda7fb4082009-05-06 14:28:24 +00001037 def test_encode_length(self):
1038 # Issue 3739
1039 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +00001040 self.assertEqual(encoder(u"a")[1], 1)
1041 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +00001042
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001043 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +00001044 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001045
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings,
# listed in the order of the draft so that entry N is test vector 3.N:
#   * expected is None  -> nameprep must reject the input (UnicodeError)
#   * (None, None)      -> vector deliberately skipped; kept as a
#                          placeholder so the numbering stays aligned
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE: the title above names U+FB38, but the input bytes
    # \xef\xb9\xb6 are the UTF-8 encoding of U+FE76.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1198
1199
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Placeholder for a skipped vector.
                continue
            # The table stores its strings UTF-8 encoded.
            text = unicode(raw, "utf-8")
            if expected is None:
                # Input contains prohibited characters.
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            expected = unicode(expected, "utf-8")
            try:
                self.assertEqual(nameprep(text), expected)
            except Exception as err:
                # Re-raise with the vector number (3.N) so the failing
                # entry can be found in the table.
                raise test_support.TestFailed(
                    "Test 3.%d: %s" % (index + 1, str(err)))
1218
Walter Dörwald78a0be62006-04-14 18:25:39 +00001219class IDNACodecTest(unittest.TestCase):
1220 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001221 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1222 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1223 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1224 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001225
1226 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001227 self.assertEqual(u"python.org".encode("idna"), "python.org")
1228 self.assertEqual("python.org.".encode("idna"), "python.org.")
1229 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1230 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001231
Martin v. Löwis8b595142005-08-25 11:03:38 +00001232 def test_stream(self):
1233 import StringIO
1234 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1235 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001236 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001237
Walter Dörwald78a0be62006-04-14 18:25:39 +00001238 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001239 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001240 "".join(codecs.iterdecode("python.org", "idna")),
1241 u"python.org"
1242 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001243 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001244 "".join(codecs.iterdecode("python.org.", "idna")),
1245 u"python.org."
1246 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001247 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001248 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1249 u"pyth\xf6n.org."
1250 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001251 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001252 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1253 u"pyth\xf6n.org."
1254 )
1255
1256 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001257 self.assertEqual(decoder.decode("xn--xam", ), u"")
1258 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1259 self.assertEqual(decoder.decode(u"rg"), u"")
1260 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001261
1262 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001263 self.assertEqual(decoder.decode("xn--xam", ), u"")
1264 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1265 self.assertEqual(decoder.decode("rg."), u"org.")
1266 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001267
1268 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001269 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001270 "".join(codecs.iterencode(u"python.org", "idna")),
1271 "python.org"
1272 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001273 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001274 "".join(codecs.iterencode(u"python.org.", "idna")),
1275 "python.org."
1276 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001277 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001278 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1279 "xn--pythn-mua.org."
1280 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001281 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001282 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1283 "xn--pythn-mua.org."
1284 )
1285
1286 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001287 self.assertEqual(encoder.encode(u"\xe4x"), "")
1288 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1289 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001290
1291 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001292 self.assertEqual(encoder.encode(u"\xe4x"), "")
1293 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1294 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001295
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001296class CodecsModuleTest(unittest.TestCase):
1297
1298 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001299 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001300 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001301 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001302 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001303 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1304
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001305 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001306 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001307 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001308 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001309 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001310 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001311 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1312
1313 def test_register(self):
1314 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001315 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001316
1317 def test_lookup(self):
1318 self.assertRaises(TypeError, codecs.lookup)
1319 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001320 self.assertRaises(LookupError, codecs.lookup, " ")
1321
1322 def test_getencoder(self):
1323 self.assertRaises(TypeError, codecs.getencoder)
1324 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1325
1326 def test_getdecoder(self):
1327 self.assertRaises(TypeError, codecs.getdecoder)
1328 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1329
1330 def test_getreader(self):
1331 self.assertRaises(TypeError, codecs.getreader)
1332 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1333
1334 def test_getwriter(self):
1335 self.assertRaises(TypeError, codecs.getwriter)
1336 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001337
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001338 def test_lookup_issue1813(self):
1339 # Issue #1813: under Turkish locales, lookup of some codecs failed
1340 # because 'I' is lowercased as a dotless "i"
1341 oldlocale = locale.getlocale(locale.LC_CTYPE)
1342 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1343 try:
1344 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1345 except locale.Error:
1346 # Unsupported locale on this system
1347 self.skipTest('test needs Turkish locale')
1348 c = codecs.lookup('ASCII')
1349 self.assertEqual(c.name, 'ascii')
1350
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001351class StreamReaderTest(unittest.TestCase):
1352
1353 def setUp(self):
1354 self.reader = codecs.getreader('utf-8')
1355 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1356
1357 def test_readlines(self):
1358 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001359 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001360
Georg Brandl8f99f812006-10-29 08:39:22 +00001361class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001362
Georg Brandl8f99f812006-10-29 08:39:22 +00001363 def test_basic(self):
1364 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001365 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001366 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001367
1368 f = StringIO.StringIO()
1369 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1370 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001371 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001372
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001373class Str2StrTest(unittest.TestCase):
1374
1375 def test_read(self):
1376 sin = "\x80".encode("base64_codec")
1377 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1378 sout = reader.read()
1379 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001380 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001381
1382 def test_readline(self):
1383 sin = "\x80".encode("base64_codec")
1384 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1385 sout = reader.readline()
1386 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001387 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001388
# Codec names exercised by BasicUnicodeTest.  Every codec listed here is
# expected to round-trip the ASCII sample text used by those tests.
# Further names are appended below when optional modules are available.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only tested when this build of the codecs module provides it
# (presumably Windows only -- confirm against the codecs documentation).
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1496
# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Independent copy, taken before the optional bz2/zlib codecs are added
# to broken_unicode_with_streams below -- so those two are skipped by the
# stream checks but still run through the incremental coder checks.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# The compression codecs are registered for testing only when their
# backing module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1539
1540class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        # Round-trip a short ASCII sample through every codec named in
        # all_unicode_encodings, using each codec API in turn: the
        # stateless encoder/decoder, the stream writer/reader, the
        # incremental coders and iterencode()/iterdecode().
        s = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the list entry once
            # "_" and "-" are treated alike.  Two adjustments are applied
            # first: "_codec" is re-appended and latin_1 is special-cased
            # (presumably lookup() reports other names for these -- confirm).
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # stateless round trip; the encoder must report having
            # consumed the whole input
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # (one character written, then one encoded byte read back,
                # at a time)
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final call flushes whatever the encoder still holds
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(
                            codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        # note: unlike the strict-mode check above, no
                        # final (flush) call is made here
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c)
                                                 for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)
Tim Petersabd8a332006-11-03 02:32:46 +00001614
    @test_support.cpython_only
    def test_basics_capi(self):
        # Same incremental round-trip checks as test_basics, but with the
        # coder objects obtained through the _testcapi helpers instead of
        # codecs.getincrementalencoder()/getincrementaldecoder().
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    # final call flushes whatever the encoder still holds
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        # note: no final (flush) call is made in this branch
                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c)
                                                 for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001653
Walter Dörwald729c31f2005-03-14 19:06:30 +00001654 def test_seek(self):
1655 # all codecs should be able to encode these
1656 s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
1657 for encoding in all_unicode_encodings:
1658 if encoding == "idna": # FIXME: See SF bug #1163178
1659 continue
1660 if encoding in broken_unicode_with_streams:
1661 continue
1662 reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
1663 for t in xrange(5):
1664 # Test that calling seek resets the internal codec state and buffers
1665 reader.seek(0, 0)
1666 line = reader.readline()
1667 self.assertEqual(s[:len(line)], line)
1668
Walter Dörwalde22d3392005-11-17 08:52:34 +00001669 def test_bad_decode_args(self):
1670 for encoding in all_unicode_encodings:
1671 decoder = codecs.getdecoder(encoding)
1672 self.assertRaises(TypeError, decoder)
1673 if encoding not in ("idna", "punycode"):
1674 self.assertRaises(TypeError, decoder, 42)
1675
1676 def test_bad_encode_args(self):
1677 for encoding in all_unicode_encodings:
1678 encoder = codecs.getencoder(encoding)
1679 self.assertRaises(TypeError, encoder)
1680
Neal Norwitz6d3d3392006-06-13 08:41:06 +00001681 def test_encoding_map_type_initialized(self):
1682 from encodings import cp1140
1683 # This used to crash, we are only verifying there's no crash.
1684 table_type = type(cp1140.encoding_table)
1685 self.assertEqual(table_type, table_type)
1686
class BasicStrTest(unittest.TestCase):
    # Round-trips a byte string through each codec listed in
    # all_string_encodings.

    def test_basics(self):
        sample = "abc123"
        for encoding in all_string_encodings:
            encode = codecs.getencoder(encoding)
            encoded, consumed = encode(sample)
            self.assertEqual(consumed, len(sample))
            decode = codecs.getdecoder(encoding)
            decoded, consumed = decode(encoded)
            failure_msg = "%r != %r (encoding=%r)" % (decoded, sample, encoding)
            self.assertEqual(decoded, sample, failure_msg)
1695
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001696class CharmapTest(unittest.TestCase):
1697 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001698 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001699 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1700 (u"abc", 3)
1701 )
1702
Serhiy Storchaka95997452013-01-15 14:42:59 +02001703 self.assertRaises(UnicodeDecodeError,
1704 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1705 )
1706
1707 self.assertRaises(UnicodeDecodeError,
1708 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1709 )
1710
Ezio Melotti2623a372010-11-21 13:34:58 +00001711 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001712 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1713 (u"ab\ufffd", 3)
1714 )
1715
Ezio Melotti2623a372010-11-21 13:34:58 +00001716 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001717 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1718 (u"ab\ufffd", 3)
1719 )
1720
Ezio Melotti2623a372010-11-21 13:34:58 +00001721 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001722 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1723 (u"ab", 3)
1724 )
1725
Ezio Melotti2623a372010-11-21 13:34:58 +00001726 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001727 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1728 (u"ab", 3)
1729 )
1730
1731 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001732 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001733 codecs.charmap_decode(allbytes, "ignore", u""),
1734 (u"", len(allbytes))
1735 )
1736
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001737 def test_decode_with_int2str_map(self):
1738 self.assertEqual(
1739 codecs.charmap_decode("\x00\x01\x02", "strict",
1740 {0: u'a', 1: u'b', 2: u'c'}),
1741 (u"abc", 3)
1742 )
1743
1744 self.assertEqual(
1745 codecs.charmap_decode("\x00\x01\x02", "strict",
1746 {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
1747 (u"AaBbCc", 3)
1748 )
1749
1750 self.assertEqual(
1751 codecs.charmap_decode("\x00\x01\x02", "strict",
1752 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
1753 (u"\U0010FFFFbc", 3)
1754 )
1755
1756 self.assertEqual(
1757 codecs.charmap_decode("\x00\x01\x02", "strict",
1758 {0: u'a', 1: u'b', 2: u''}),
1759 (u"ab", 3)
1760 )
1761
1762 self.assertRaises(UnicodeDecodeError,
1763 codecs.charmap_decode, "\x00\x01\x02", "strict",
1764 {0: u'a', 1: u'b'}
1765 )
1766
Serhiy Storchaka95997452013-01-15 14:42:59 +02001767 self.assertRaises(UnicodeDecodeError,
1768 codecs.charmap_decode, "\x00\x01\x02", "strict",
1769 {0: u'a', 1: u'b', 2: None}
1770 )
1771
1772 # Issue #14850
1773 self.assertRaises(UnicodeDecodeError,
1774 codecs.charmap_decode, "\x00\x01\x02", "strict",
1775 {0: u'a', 1: u'b', 2: u'\ufffe'}
1776 )
1777
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001778 self.assertEqual(
1779 codecs.charmap_decode("\x00\x01\x02", "replace",
1780 {0: u'a', 1: u'b'}),
1781 (u"ab\ufffd", 3)
1782 )
1783
1784 self.assertEqual(
1785 codecs.charmap_decode("\x00\x01\x02", "replace",
1786 {0: u'a', 1: u'b', 2: None}),
1787 (u"ab\ufffd", 3)
1788 )
1789
Serhiy Storchaka95997452013-01-15 14:42:59 +02001790 # Issue #14850
1791 self.assertEqual(
1792 codecs.charmap_decode("\x00\x01\x02", "replace",
1793 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1794 (u"ab\ufffd", 3)
1795 )
1796
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001797 self.assertEqual(
1798 codecs.charmap_decode("\x00\x01\x02", "ignore",
1799 {0: u'a', 1: u'b'}),
1800 (u"ab", 3)
1801 )
1802
1803 self.assertEqual(
1804 codecs.charmap_decode("\x00\x01\x02", "ignore",
1805 {0: u'a', 1: u'b', 2: None}),
1806 (u"ab", 3)
1807 )
1808
Serhiy Storchaka95997452013-01-15 14:42:59 +02001809 # Issue #14850
1810 self.assertEqual(
1811 codecs.charmap_decode("\x00\x01\x02", "ignore",
1812 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1813 (u"ab", 3)
1814 )
1815
1816 allbytes = "".join(chr(i) for i in xrange(256))
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001817 self.assertEqual(
1818 codecs.charmap_decode(allbytes, "ignore", {}),
1819 (u"", len(allbytes))
1820 )
1821
1822 def test_decode_with_int2int_map(self):
1823 a = ord(u'a')
1824 b = ord(u'b')
1825 c = ord(u'c')
1826
1827 self.assertEqual(
1828 codecs.charmap_decode("\x00\x01\x02", "strict",
1829 {0: a, 1: b, 2: c}),
1830 (u"abc", 3)
1831 )
1832
1833 # Issue #15379
1834 self.assertEqual(
1835 codecs.charmap_decode("\x00\x01\x02", "strict",
1836 {0: 0x10FFFF, 1: b, 2: c}),
1837 (u"\U0010FFFFbc", 3)
1838 )
1839
1840 self.assertRaises(TypeError,
1841 codecs.charmap_decode, "\x00\x01\x02", "strict",
1842 {0: 0x110000, 1: b, 2: c}
1843 )
1844
1845 self.assertRaises(UnicodeDecodeError,
1846 codecs.charmap_decode, "\x00\x01\x02", "strict",
1847 {0: a, 1: b},
1848 )
1849
Serhiy Storchaka95997452013-01-15 14:42:59 +02001850 self.assertRaises(UnicodeDecodeError,
1851 codecs.charmap_decode, "\x00\x01\x02", "strict",
1852 {0: a, 1: b, 2: 0xFFFE},
1853 )
1854
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001855 self.assertEqual(
1856 codecs.charmap_decode("\x00\x01\x02", "replace",
1857 {0: a, 1: b}),
1858 (u"ab\ufffd", 3)
1859 )
1860
1861 self.assertEqual(
Serhiy Storchaka95997452013-01-15 14:42:59 +02001862 codecs.charmap_decode("\x00\x01\x02", "replace",
1863 {0: a, 1: b, 2: 0xFFFE}),
1864 (u"ab\ufffd", 3)
1865 )
1866
1867 self.assertEqual(
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001868 codecs.charmap_decode("\x00\x01\x02", "ignore",
1869 {0: a, 1: b}),
1870 (u"ab", 3)
1871 )
1872
Serhiy Storchaka95997452013-01-15 14:42:59 +02001873 self.assertEqual(
1874 codecs.charmap_decode("\x00\x01\x02", "ignore",
1875 {0: a, 1: b, 2: 0xFFFE}),
1876 (u"ab", 3)
1877 )
1878
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001879
Georg Brandl8f99f812006-10-29 08:39:22 +00001880class WithStmtTest(unittest.TestCase):
1881 def test_encodedfile(self):
1882 f = StringIO.StringIO("\xc3\xbc")
1883 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001884 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001885
1886 def test_streamreaderwriter(self):
1887 f = StringIO.StringIO("\xc3\xbc")
1888 info = codecs.lookup("utf-8")
1889 with codecs.StreamReaderWriter(f, info.streamreader,
1890 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001891 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001892
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001893
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.unicode_escape_encode() and
    # codecs.unicode_escape_decode().

    def test_empty(self):
        encoded = codecs.unicode_escape_encode(u"")
        self.assertEqual(encoded, ("", 0))
        decoded = codecs.unicode_escape_decode("")
        self.assertEqual(decoded, (u"", 0))

    def test_raw_encode(self):
        # Characters 32..126 other than the backslash encode to themselves.
        encode = codecs.unicode_escape_encode
        backslash = ord('\\')
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(unichr(code)), (chr(code), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash, followed by '0', decodes to
        # the character with the same ordinal followed by u'0'.
        decode = codecs.unicode_escape_decode
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(chr(code) + '0'),
                             (unichr(code) + u'0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        named_escapes = (
            (u'\t', r'\t'),
            (u'\n', r'\n'),
            (u'\r', r'\r'),
            (u'\\', r'\\'),
        )
        for char, escaped in named_escapes:
            check(char, escaped)
        # The remaining characters below 32 and those in 127..255 are
        # written as \xNN escapes.
        for code in range(32):
            if chr(code) in '\t\n\r':
                continue
            check(unichr(code), '\\x%02x' % code)
        for code in range(127, 256):
            check(unichr(code), '\\x%02x' % code)
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        cases = (
            ("[\\\n]", u"[]"),
            (r'[\"]', u'["]'),
            (r"[\']", u"[']"),
            (r"[\\]", u"[\\]"),
            (r"[\a]", u"[\x07]"),
            (r"[\b]", u"[\x08]"),
            (r"[\t]", u"[\x09]"),
            (r"[\n]", u"[\x0a]"),
            (r"[\v]", u"[\x0b]"),
            (r"[\f]", u"[\x0c]"),
            (r"[\r]", u"[\x0d]"),
            (r"[\7]", u"[\x07]"),
            (r"[\8]", u"[\\8]"),
            (r"[\78]", u"[\x078]"),
            (r"[\41]", u"[!]"),
            (r"[\418]", u"[!8]"),
            (r"[\101]", u"[A]"),
            (r"[\1010]", u"[A0]"),
            (r"[\x41]", u"[A]"),
            (r"[\x410]", u"[A0]"),
            (r"\u20ac", u"\u20ac"),
            (r"\U0001d120", u"\U0001d120"),
        )
        for escaped, expected in cases:
            check(escaped, expected)
        # A backslash followed by any other byte is kept unchanged.
        for code in range(256):
            if chr(code) in '\n"\'\\abtnvfr01234567xuUN':
                continue
            check('\\' + chr(code), u'\\' + unichr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # For each escape letter, try 0 .. digits-1 zero digits after it.
        for letter, digits in ('x', 2), ('u', 4), ('U', 4):
            for count in range(digits):
                truncated = "\\" + letter + "0"*count
                bracketed = "[" + truncated + "]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"),
                                 (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # A code point above 0x10FFFF is an error as well.
        too_large = r"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), (u"", 10))
        self.assertEqual(decode(too_large, "replace"), (u"\ufffd", 10))
1970
1971
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.raw_unicode_escape_encode() and
    # codecs.raw_unicode_escape_decode().

    def test_empty(self):
        encoded = codecs.raw_unicode_escape_encode(u"")
        self.assertEqual(encoded, ("", 0))
        decoded = codecs.raw_unicode_escape_decode("")
        self.assertEqual(decoded, (u"", 0))

    def test_raw_encode(self):
        # Every character below 256 encodes to the byte with that ordinal.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(unichr(code)), (chr(code), 1))

    def test_raw_decode(self):
        # Every byte, followed by '0', decodes to the character with the
        # same ordinal followed by u'0'.
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(chr(code) + '0'),
                             (unichr(code) + u'0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but 'u' or 'U' is kept as is.
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check(u'\\' + unichr(code), '\\' + chr(code))
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything but 'u' or 'U' is kept as is.
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check('\\' + chr(code), u'\\' + unichr(code))
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # For each escape letter, try 0 .. digits-1 zero digits after it.
        for letter, digits in ('u', 4), ('U', 4):
            for count in range(digits):
                truncated = "\\" + letter + "0"*count
                bracketed = "[" + truncated + "]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"),
                                 (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # A code point above 0x10FFFF is an error as well.
        too_large = r"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), (u"", 10))
        self.assertEqual(decode(too_large, "replace"), (u"\ufffd", 10))
2020
2021
class BomTest(unittest.TestCase):
    # Checks how files opened with codecs.open() for the UTF-16 and UTF-32
    # codecs behave when writes are combined with seek().

    def test_seek0(self):
        text = u"1234567890"
        doubled = text * 2
        bom_encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        path = test_support.TESTFN
        self.addCleanup(test_support.unlink, path)

        def reopen(encoding):
            # Open the scratch file again in 'w+' mode with this encoding.
            return codecs.open(path, 'w+', encoding=encoding)

        for encoding in bom_encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(text)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(text[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinner7df55da2010-05-22 13:37:56 +00002077
Victor Stinner262be5e2010-05-22 02:11:07 +00002078
Fred Drake2e2be372001-09-20 21:33:42 +00002079def test_main():
Walter Dörwald21d3a322003-05-01 17:45:56 +00002080 test_support.run_unittest(
Walter Dörwald6e390802007-08-17 16:41:28 +00002081 UTF32Test,
2082 UTF32LETest,
2083 UTF32BETest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002084 UTF16Test,
Walter Dörwald69652032004-09-07 20:24:22 +00002085 UTF16LETest,
2086 UTF16BETest,
2087 UTF8Test,
Martin v. Löwis412ed3b2006-01-08 10:45:39 +00002088 UTF8SigTest,
Walter Dörwalde22d3392005-11-17 08:52:34 +00002089 UTF7Test,
2090 UTF16ExTest,
2091 ReadBufferTest,
2092 CharBufferTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002093 EscapeDecodeTest,
2094 RecodingTest,
2095 PunycodeTest,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002096 UnicodeInternalTest,
Martin v. Löwisa1dde132004-03-24 16:48:24 +00002097 NameprepTest,
Walter Dörwald78a0be62006-04-14 18:25:39 +00002098 IDNACodecTest,
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00002099 CodecsModuleTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002100 StreamReaderTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002101 EncodedFileTest,
Walter Dörwaldc9878e12005-07-20 22:15:39 +00002102 Str2StrTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002103 BasicUnicodeTest,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002104 BasicStrTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002105 CharmapTest,
2106 WithStmtTest,
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002107 UnicodeEscapeTest,
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02002108 RawUnicodeEscapeTest,
Victor Stinner262be5e2010-05-22 02:11:07 +00002109 BomTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002110 )
Fred Drake2e2be372001-09-20 21:33:42 +00002111
Serhiy Storchakab4f3d802014-11-07 14:07:43 +02002112 def test_uu_invalid(self):
2113 # Missing "begin" line
2114 self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
2115
Fred Drake2e2be372001-09-20 21:33:42 +00002116
2117if __name__ == "__main__":
2118 test_main()