blob: 0ccf8183e57c4c78eb18de07eed91acf74e8dd02 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Walter Dörwald9ae019b2006-03-18 14:22:26 +00005import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
def coding_checker(self, coder):
    """Build an assertion helper for an encode/decode function.

    *self* is the running test case (any object with ``assertEqual``) and
    *coder* is a callable returning a ``(result, consumed_length)`` pair.

    The returned ``check(input, expect)`` asserts that ``coder(input)``
    equals ``(expect, len(input))``, i.e. that the expected result was
    produced and the whole input was reported as consumed.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
11
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        # Everything written but not yet read, oldest data first.
        self._buffer = ""

    def write(self, chars):
        """Append *chars* to the end of the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return pending data from the front of the queue.

        A negative *size* (the default) drains everything; otherwise at
        most *size* items are returned and the rest stays queued.
        """
        if size < 0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
31
Walter Dörwalde57d7b12004-12-21 22:24:00 +000032class ReadTest(unittest.TestCase):
33 def check_partial(self, input, partialresults):
Walter Dörwald69652032004-09-07 20:24:22 +000034 # get a StreamReader for the encoding and feed the bytestring version
Walter Dörwaldfc7e72d2007-11-19 12:14:05 +000035 # of input to the reader byte by byte. Read everything available from
Walter Dörwald69652032004-09-07 20:24:22 +000036 # the StreamReader and check that the results equal the appropriate
37 # entries from partialresults.
38 q = Queue()
Walter Dörwalde57d7b12004-12-21 22:24:00 +000039 r = codecs.getreader(self.encoding)(q)
Walter Dörwald69652032004-09-07 20:24:22 +000040 result = u""
Walter Dörwalde57d7b12004-12-21 22:24:00 +000041 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
Walter Dörwald69652032004-09-07 20:24:22 +000042 q.write(c)
43 result += r.read()
44 self.assertEqual(result, partialresult)
45 # check that there's nothing left in the buffers
46 self.assertEqual(r.read(), u"")
47 self.assertEqual(r.bytebuffer, "")
48 self.assertEqual(r.charbuffer, u"")
49
Walter Dörwaldabb02e52006-03-15 11:35:15 +000050 # do the check again, this time using a incremental decoder
51 d = codecs.getincrementaldecoder(self.encoding)()
52 result = u""
53 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
54 result += d.decode(c)
55 self.assertEqual(result, partialresult)
56 # check that there's nothing left in the buffers
57 self.assertEqual(d.decode("", True), u"")
58 self.assertEqual(d.buffer, "")
59
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +000060 # Check whether the reset method works properly
Walter Dörwaldabb02e52006-03-15 11:35:15 +000061 d.reset()
62 result = u""
63 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
64 result += d.decode(c)
65 self.assertEqual(result, partialresult)
66 # check that there's nothing left in the buffers
67 self.assertEqual(d.decode("", True), u"")
68 self.assertEqual(d.buffer, "")
69
70 # check iterdecode()
71 encoded = input.encode(self.encoding)
72 self.assertEqual(
73 input,
74 u"".join(codecs.iterdecode(encoded, self.encoding))
75 )
76
Walter Dörwalde57d7b12004-12-21 22:24:00 +000077 def test_readline(self):
78 def getreader(input):
79 stream = StringIO.StringIO(input.encode(self.encoding))
80 return codecs.getreader(self.encoding)(stream)
81
Walter Dörwaldca199432006-03-06 22:39:12 +000082 def readalllines(input, keepends=True, size=None):
Walter Dörwalde57d7b12004-12-21 22:24:00 +000083 reader = getreader(input)
84 lines = []
85 while True:
Walter Dörwaldca199432006-03-06 22:39:12 +000086 line = reader.readline(size=size, keepends=keepends)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000087 if not line:
88 break
89 lines.append(line)
Walter Dörwaldca199432006-03-06 22:39:12 +000090 return "|".join(lines)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000091
92 s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
Walter Dörwaldca199432006-03-06 22:39:12 +000093 sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
94 sexpectednoends = u"foo|bar|baz|spam|eggs"
95 self.assertEqual(readalllines(s, True), sexpected)
96 self.assertEqual(readalllines(s, False), sexpectednoends)
97 self.assertEqual(readalllines(s, True, 10), sexpected)
98 self.assertEqual(readalllines(s, False, 10), sexpectednoends)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000099
100 # Test long lines (multiple calls to read() in readline())
101 vw = []
102 vwo = []
103 for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
104 vw.append((i*200)*u"\3042" + lineend)
105 vwo.append((i*200)*u"\3042")
106 self.assertEqual(readalllines("".join(vw), True), "".join(vw))
107 self.assertEqual(readalllines("".join(vw), False),"".join(vwo))
108
109 # Test lines where the first read might end with \r, so the
110 # reader has to look ahead whether this is a lone \r or a \r\n
111 for size in xrange(80):
112 for lineend in u"\n \r\n \r \u2028".split():
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000113 s = 10*(size*u"a" + lineend + u"xxx\n")
114 reader = getreader(s)
115 for i in xrange(10):
116 self.assertEqual(
117 reader.readline(keepends=True),
118 size*u"a" + lineend,
119 )
120 reader = getreader(s)
121 for i in xrange(10):
122 self.assertEqual(
123 reader.readline(keepends=False),
124 size*u"a",
125 )
126
127 def test_bug1175396(self):
128 s = [
129 '<%!--===================================================\r\n',
130 ' BLOG index page: show recent articles,\r\n',
131 ' today\'s articles, or articles of a specific date.\r\n',
132 '========================================================--%>\r\n',
133 '<%@inputencoding="ISO-8859-1"%>\r\n',
134 '<%@pagetemplate=TEMPLATE.y%>\r\n',
135 '<%@import=import frog.util, frog%>\r\n',
136 '<%@import=import frog.objects%>\r\n',
137 '<%@import=from frog.storageerrors import StorageError%>\r\n',
138 '<%\r\n',
139 '\r\n',
140 'import logging\r\n',
141 'log=logging.getLogger("Snakelets.logger")\r\n',
142 '\r\n',
143 '\r\n',
144 'user=self.SessionCtx.user\r\n',
145 'storageEngine=self.SessionCtx.storageEngine\r\n',
146 '\r\n',
147 '\r\n',
148 'def readArticlesFromDate(date, count=None):\r\n',
149 ' entryids=storageEngine.listBlogEntries(date)\r\n',
150 ' entryids.reverse() # descending\r\n',
151 ' if count:\r\n',
152 ' entryids=entryids[:count]\r\n',
153 ' try:\r\n',
154 ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
155 ' except StorageError,x:\r\n',
156 ' log.error("Error loading articles: "+str(x))\r\n',
157 ' self.abort("cannot load articles")\r\n',
158 '\r\n',
159 'showdate=None\r\n',
160 '\r\n',
161 'arg=self.Request.getArg()\r\n',
162 'if arg=="today":\r\n',
163 ' #-------------------- TODAY\'S ARTICLES\r\n',
164 ' self.write("<h2>Today\'s articles</h2>")\r\n',
165 ' showdate = frog.util.isodatestr() \r\n',
166 ' entries = readArticlesFromDate(showdate)\r\n',
167 'elif arg=="active":\r\n',
168 ' #-------------------- ACTIVE ARTICLES redirect\r\n',
169 ' self.Yredirect("active.y")\r\n',
170 'elif arg=="login":\r\n',
171 ' #-------------------- LOGIN PAGE redirect\r\n',
172 ' self.Yredirect("login.y")\r\n',
173 'elif arg=="date":\r\n',
174 ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
175 ' showdate = self.Request.getParameter("date")\r\n',
176 ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
177 ' entries = readArticlesFromDate(showdate)\r\n',
178 'else:\r\n',
179 ' #-------------------- RECENT ARTICLES\r\n',
180 ' self.write("<h2>Recent articles</h2>")\r\n',
181 ' dates=storageEngine.listBlogEntryDates()\r\n',
182 ' if dates:\r\n',
183 ' entries=[]\r\n',
184 ' SHOWAMOUNT=10\r\n',
185 ' for showdate in dates:\r\n',
186 ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
187 ' if len(entries)>=SHOWAMOUNT:\r\n',
188 ' break\r\n',
189 ' \r\n',
190 ]
191 stream = StringIO.StringIO("".join(s).encode(self.encoding))
192 reader = codecs.getreader(self.encoding)(stream)
193 for (i, line) in enumerate(reader):
194 self.assertEqual(line, s[i])
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000195
196 def test_readlinequeue(self):
197 q = Queue()
198 writer = codecs.getwriter(self.encoding)(q)
199 reader = codecs.getreader(self.encoding)(q)
200
201 # No lineends
202 writer.write(u"foo\r")
203 self.assertEqual(reader.readline(keepends=False), u"foo")
204 writer.write(u"\nbar\r")
Walter Dörwald43148c82005-04-21 21:45:36 +0000205 self.assertEqual(reader.readline(keepends=False), u"")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000206 self.assertEqual(reader.readline(keepends=False), u"bar")
207 writer.write(u"baz")
208 self.assertEqual(reader.readline(keepends=False), u"baz")
209 self.assertEqual(reader.readline(keepends=False), u"")
210
211 # Lineends
212 writer.write(u"foo\r")
213 self.assertEqual(reader.readline(keepends=True), u"foo\r")
214 writer.write(u"\nbar\r")
Walter Dörwald43148c82005-04-21 21:45:36 +0000215 self.assertEqual(reader.readline(keepends=True), u"\n")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000216 self.assertEqual(reader.readline(keepends=True), u"bar\r")
217 writer.write(u"baz")
218 self.assertEqual(reader.readline(keepends=True), u"baz")
219 self.assertEqual(reader.readline(keepends=True), u"")
220 writer.write(u"foo\r\n")
221 self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
222
Walter Dörwald9fa09462005-01-10 12:01:39 +0000223 def test_bug1098990_a(self):
224 s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
225 s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
226 s3 = u"next line.\r\n"
227
228 s = (s1+s2+s3).encode(self.encoding)
229 stream = StringIO.StringIO(s)
230 reader = codecs.getreader(self.encoding)(stream)
231 self.assertEqual(reader.readline(), s1)
232 self.assertEqual(reader.readline(), s2)
233 self.assertEqual(reader.readline(), s3)
234 self.assertEqual(reader.readline(), u"")
235
236 def test_bug1098990_b(self):
237 s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
238 s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
239 s3 = u"stillokay:bbbbxx\r\n"
240 s4 = u"broken!!!!badbad\r\n"
241 s5 = u"againokay.\r\n"
242
243 s = (s1+s2+s3+s4+s5).encode(self.encoding)
244 stream = StringIO.StringIO(s)
245 reader = codecs.getreader(self.encoding)(stream)
246 self.assertEqual(reader.readline(), s1)
247 self.assertEqual(reader.readline(), s2)
248 self.assertEqual(reader.readline(), s3)
249 self.assertEqual(reader.readline(), s4)
250 self.assertEqual(reader.readline(), s5)
251 self.assertEqual(reader.readline(), u"")
252
class UTF32Test(ReadTest):
    """ReadTest checks plus BOM handling for the "utf-32" codec."""
    encoding = "utf-32"

    # u"spamspam" preceded by exactly one BOM, in little- and big-endian
    # byte order respectively.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
337
class UTF32LETest(ReadTest):
    """ReadTest checks plus codec-specific cases for "utf-32-le"."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One expected result per encoded byte; there is no BOM, so a
        # character appears as soon as its fourth byte has been fed.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
381
class UTF32BETest(ReadTest):
    """ReadTest checks plus codec-specific cases for "utf-32-be"."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One expected result per encoded byte; there is no BOM, so a
        # character appears as soon as its fourth byte has been fed.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
425
426
class UTF16Test(ReadTest):
    """ReadTest checks plus BOM handling for the "utf-16" codec."""
    encoding = "utf-16"

    # u"spamspam" preceded by exactly one BOM, in little- and big-endian
    # byte order respectively.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        # Register the cleanup before the file is created so it is removed
        # even if one of the steps below fails.
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000499
class UTF16LETest(ReadTest):
    """ReadTest checks plus malformed-input cases for "utf-16-le"."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One expected result per encoded byte (no BOM for this codec).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000536
class UTF16BETest(ReadTest):
    """ReadTest checks plus malformed-input cases for "utf-16-be"."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One expected result per encoded byte (no BOM for this codec).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000573
class UTF8Test(ReadTest):
    """ReadTest checks for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # One expected result per encoded byte: a character only shows up
        # once the last byte of its multi-byte sequence has been fed.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )
598
class UTF7Test(ReadTest):
    """ReadTest checks for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # One expected result per byte of the encoded form of u"a+-b".
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000613
614class UTF16ExTest(unittest.TestCase):
615
616 def test_errors(self):
617 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
618
619 def test_bad_args(self):
620 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
621
622class ReadBufferTest(unittest.TestCase):
623
624 def test_array(self):
625 import array
626 self.assertEqual(
627 codecs.readbuffer_encode(array.array("c", "spam")),
628 ("spam", 4)
629 )
630
631 def test_empty(self):
632 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
633
634 def test_bad_args(self):
635 self.assertRaises(TypeError, codecs.readbuffer_encode)
636 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
637
638class CharBufferTest(unittest.TestCase):
639
640 def test_string(self):
641 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
642
643 def test_empty(self):
644 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
645
646 def test_bad_args(self):
647 self.assertRaises(TypeError, codecs.charbuffer_encode)
648 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
649
class UTF8SigTest(ReadTest):
    """ReadTest checks plus signature (BOM) handling for "utf-8-sig"."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Decode *bytestring* through a utf-8-sig StreamReader using a range
        # of read() size hints (None means "read everything at once") and
        # check that the concatenated chunks always equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Same text without the signature.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
733
Walter Dörwald8709a422002-09-03 13:53:40 +0000734class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000735 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000736 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000737
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200738 def test_raw(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200739 decode = codecs.escape_decode
740 for b in range(256):
741 b = chr(b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200742 if b != '\\':
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200743 self.assertEqual(decode(b + '0'), (b + '0', 2))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200744
745 def test_escape(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200746 decode = codecs.escape_decode
747 check = coding_checker(self, decode)
748 check(b"[\\\n]", b"[]")
749 check(br'[\"]', b'["]')
750 check(br"[\']", b"[']")
751 check(br"[\\]", br"[\]")
752 check(br"[\a]", b"[\x07]")
753 check(br"[\b]", b"[\x08]")
754 check(br"[\t]", b"[\x09]")
755 check(br"[\n]", b"[\x0a]")
756 check(br"[\v]", b"[\x0b]")
757 check(br"[\f]", b"[\x0c]")
758 check(br"[\r]", b"[\x0d]")
759 check(br"[\7]", b"[\x07]")
760 check(br"[\8]", br"[\8]")
761 check(br"[\78]", b"[\x078]")
762 check(br"[\41]", b"[!]")
763 check(br"[\418]", b"[!8]")
764 check(br"[\101]", b"[A]")
765 check(br"[\1010]", b"[A0]")
766 check(br"[\501]", b"[A]")
767 check(br"[\x41]", b"[A]")
768 check(br"[\X41]", br"[\X41]")
769 check(br"[\x410]", b"[A0]")
770 for b in range(256):
771 b = chr(b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200772 if b not in '\n"\'\\abtnvfr01234567x':
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200773 check('\\' + b, '\\' + b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200774
775 def test_errors(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200776 decode = codecs.escape_decode
777 self.assertRaises(ValueError, decode, br"\x")
778 self.assertRaises(ValueError, decode, br"[\x]")
779 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
780 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
781 self.assertRaises(ValueError, decode, br"\x0")
782 self.assertRaises(ValueError, decode, br"[\x0]")
783 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
784 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200785
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000786class RecodingTest(unittest.TestCase):
787 def test_recoding(self):
788 f = StringIO.StringIO()
789 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
790 f2.write(u"a")
791 f2.close()
792 # Python used to crash on this at exit because of a refcount
793 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000794
# From RFC 3492
# Each entry is a (unicode text, expected punycode encoding) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple (for
# example because a comma was dropped between two string literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
902
class PunycodeTest(unittest.TestCase):
    """Check the punycode codec against the RFC 3492 sample strings."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000916
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the unicode_internal codec."""

    # The decoding tests feed 4-byte code units, so they only make sense on
    # UCS-4 builds.  Report a skip elsewhere instead of passing silently.
    _requires_ucs4 = unittest.skipUnless(sys.maxunicode > 0xffff,
                                         "requires a UCS-4 build")

    @_requires_ucs4
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            ("\x00\x10\xff\xff", u"\U0010ffff"),
            ("\x00\x00\x01\x01", u"\U00000101"),
            ("", u""),
        ]
        not_ok = [
            "\x7f\xff\xff\xff",
            "\x80\x00\x00\x00",
            "\x81\x00\x00\x00",
            "\x00",
            "\x00\x00\x00\x00\x00",
        ]
        # The byte strings above are written big-endian; flip them on
        # little-endian machines.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = "".join(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = "".join(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode,
                "unicode_internal")

    @_requires_ucs4
    def test_decode_error_attributes(self):
        data = "\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises fails the test if no UnicodeDecodeError is raised.
        with self.assertRaises(UnicodeDecodeError) as cm:
            data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @_requires_ucs4
    def test_decode_callback(self):
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = u"ab".encode("unicode_internal")
        # Insert one invalid 4-byte unit between the encodings of "a" and "b".
        ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
            "UnicodeInternalTest")
        self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000973
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (UTF-8 input, UTF-8 expected output).  An expected output of
# None means nameprep must reject the input; (None, None) marks a skipped
# vector and keeps the list position equal to the test number minus one.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1126
1127
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the libidn test vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Vectors are numbered 3.1, 3.2, ... in the source document.
            label = "Test 3.%d" % (pos + 1)
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                try:
                    nameprep(orig)
                except UnicodeError:
                    pass
                else:
                    self.fail("%s: prohibited input %r was not rejected"
                              % (label, orig))
            else:
                prepped = unicode(prepped, "utf-8")
                # Report problems as ordinary test failures that carry the
                # vector number, rather than re-raising a different
                # exception type and losing the original traceback.
                try:
                    result = nameprep(orig)
                except UnicodeError as e:
                    self.fail("%s: unexpected %s: %s"
                              % (label, type(e).__name__, e))
                self.assertEqual(result, prepped,
                                 "%s: %r != %r" % (label, result, prepped))
1146
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual(u"python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        # Everything was consumed by the first read.
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        # Same four cases as test_builtin_decode: with and without a
        # trailing dot, pure ASCII and punycode labels.
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org", "idna")),
            u"pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # A label is only emitted once the dot that ends it has been seen,
        # or when the final flag is passed.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        # Same four cases as test_builtin_encode.
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org", "idna")),
            "xn--pythn-mua.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001223
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and lookup behaviour of the codecs module API."""

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        # Any non-ASCII character will do; this mirrors test_decode.
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        #
        # Save the current setting with setlocale(category): it returns the
        # exact string the C library reported.  getlocale() returns a
        # parsed and normalised tuple instead, which setlocale() may not
        # accept back during cleanup.
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1278
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two Hangul syllables in UTF-8, separated by a newline.
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001288
class EncodedFileTest(unittest.TestCase):
    """codecs.EncodedFile recodes data on both read and write."""

    def test_basic(self):
        # Reading: the underlying UTF-8 file content comes back as UTF-16-LE.
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 input is stored in the underlying file as Latin-1.
        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001300
class Str2StrTest(unittest.TestCase):
    """Stream readers for str-to-str codecs must return str, not unicode."""

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assertIsInstance(sout, str)

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001316
# Codecs that can encode the unicode string u"abc123" and decode it back.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Copied before bz2_codec/zlib_codec are appended below, so those two are
# broken for streams only and still get the incremental coder checks.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# The compression codecs are only tested when their module is available.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1467
class BasicUnicodeTest(unittest.TestCase):
    """Round-trip and argument checks run over every unicode codec."""

    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the list entry, modulo
            # "_" vs "-", the "_codec" suffix and the latin_1 spelling.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (encoded, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line,
                                 "%r != %r (encoding=%r)" % (s[:len(line)], line, encoding))

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1590
class BasicStrTest(unittest.TestCase):
    """Round-trip check for the codecs that only work on str."""

    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (encoded, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1599
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001600class CharmapTest(unittest.TestCase):
1601 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001602 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001603 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1604 (u"abc", 3)
1605 )
1606
Serhiy Storchaka95997452013-01-15 14:42:59 +02001607 self.assertRaises(UnicodeDecodeError,
1608 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1609 )
1610
1611 self.assertRaises(UnicodeDecodeError,
1612 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1613 )
1614
Ezio Melotti2623a372010-11-21 13:34:58 +00001615 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001616 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1617 (u"ab\ufffd", 3)
1618 )
1619
Ezio Melotti2623a372010-11-21 13:34:58 +00001620 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001621 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1622 (u"ab\ufffd", 3)
1623 )
1624
Ezio Melotti2623a372010-11-21 13:34:58 +00001625 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001626 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1627 (u"ab", 3)
1628 )
1629
Ezio Melotti2623a372010-11-21 13:34:58 +00001630 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001631 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1632 (u"ab", 3)
1633 )
1634
1635 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001636 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001637 codecs.charmap_decode(allbytes, "ignore", u""),
1638 (u"", len(allbytes))
1639 )
1640
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001641 def test_decode_with_int2str_map(self):
1642 self.assertEqual(
1643 codecs.charmap_decode("\x00\x01\x02", "strict",
1644 {0: u'a', 1: u'b', 2: u'c'}),
1645 (u"abc", 3)
1646 )
1647
1648 self.assertEqual(
1649 codecs.charmap_decode("\x00\x01\x02", "strict",
1650 {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
1651 (u"AaBbCc", 3)
1652 )
1653
1654 self.assertEqual(
1655 codecs.charmap_decode("\x00\x01\x02", "strict",
1656 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
1657 (u"\U0010FFFFbc", 3)
1658 )
1659
1660 self.assertEqual(
1661 codecs.charmap_decode("\x00\x01\x02", "strict",
1662 {0: u'a', 1: u'b', 2: u''}),
1663 (u"ab", 3)
1664 )
1665
1666 self.assertRaises(UnicodeDecodeError,
1667 codecs.charmap_decode, "\x00\x01\x02", "strict",
1668 {0: u'a', 1: u'b'}
1669 )
1670
Serhiy Storchaka95997452013-01-15 14:42:59 +02001671 self.assertRaises(UnicodeDecodeError,
1672 codecs.charmap_decode, "\x00\x01\x02", "strict",
1673 {0: u'a', 1: u'b', 2: None}
1674 )
1675
1676 # Issue #14850
1677 self.assertRaises(UnicodeDecodeError,
1678 codecs.charmap_decode, "\x00\x01\x02", "strict",
1679 {0: u'a', 1: u'b', 2: u'\ufffe'}
1680 )
1681
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001682 self.assertEqual(
1683 codecs.charmap_decode("\x00\x01\x02", "replace",
1684 {0: u'a', 1: u'b'}),
1685 (u"ab\ufffd", 3)
1686 )
1687
1688 self.assertEqual(
1689 codecs.charmap_decode("\x00\x01\x02", "replace",
1690 {0: u'a', 1: u'b', 2: None}),
1691 (u"ab\ufffd", 3)
1692 )
1693
Serhiy Storchaka95997452013-01-15 14:42:59 +02001694 # Issue #14850
1695 self.assertEqual(
1696 codecs.charmap_decode("\x00\x01\x02", "replace",
1697 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1698 (u"ab\ufffd", 3)
1699 )
1700
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001701 self.assertEqual(
1702 codecs.charmap_decode("\x00\x01\x02", "ignore",
1703 {0: u'a', 1: u'b'}),
1704 (u"ab", 3)
1705 )
1706
1707 self.assertEqual(
1708 codecs.charmap_decode("\x00\x01\x02", "ignore",
1709 {0: u'a', 1: u'b', 2: None}),
1710 (u"ab", 3)
1711 )
1712
Serhiy Storchaka95997452013-01-15 14:42:59 +02001713 # Issue #14850
1714 self.assertEqual(
1715 codecs.charmap_decode("\x00\x01\x02", "ignore",
1716 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1717 (u"ab", 3)
1718 )
1719
1720 allbytes = "".join(chr(i) for i in xrange(256))
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001721 self.assertEqual(
1722 codecs.charmap_decode(allbytes, "ignore", {}),
1723 (u"", len(allbytes))
1724 )
1725
1726 def test_decode_with_int2int_map(self):
1727 a = ord(u'a')
1728 b = ord(u'b')
1729 c = ord(u'c')
1730
1731 self.assertEqual(
1732 codecs.charmap_decode("\x00\x01\x02", "strict",
1733 {0: a, 1: b, 2: c}),
1734 (u"abc", 3)
1735 )
1736
1737 # Issue #15379
1738 self.assertEqual(
1739 codecs.charmap_decode("\x00\x01\x02", "strict",
1740 {0: 0x10FFFF, 1: b, 2: c}),
1741 (u"\U0010FFFFbc", 3)
1742 )
1743
1744 self.assertRaises(TypeError,
1745 codecs.charmap_decode, "\x00\x01\x02", "strict",
1746 {0: 0x110000, 1: b, 2: c}
1747 )
1748
1749 self.assertRaises(UnicodeDecodeError,
1750 codecs.charmap_decode, "\x00\x01\x02", "strict",
1751 {0: a, 1: b},
1752 )
1753
Serhiy Storchaka95997452013-01-15 14:42:59 +02001754 self.assertRaises(UnicodeDecodeError,
1755 codecs.charmap_decode, "\x00\x01\x02", "strict",
1756 {0: a, 1: b, 2: 0xFFFE},
1757 )
1758
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001759 self.assertEqual(
1760 codecs.charmap_decode("\x00\x01\x02", "replace",
1761 {0: a, 1: b}),
1762 (u"ab\ufffd", 3)
1763 )
1764
1765 self.assertEqual(
Serhiy Storchaka95997452013-01-15 14:42:59 +02001766 codecs.charmap_decode("\x00\x01\x02", "replace",
1767 {0: a, 1: b, 2: 0xFFFE}),
1768 (u"ab\ufffd", 3)
1769 )
1770
1771 self.assertEqual(
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001772 codecs.charmap_decode("\x00\x01\x02", "ignore",
1773 {0: a, 1: b}),
1774 (u"ab", 3)
1775 )
1776
Serhiy Storchaka95997452013-01-15 14:42:59 +02001777 self.assertEqual(
1778 codecs.charmap_decode("\x00\x01\x02", "ignore",
1779 {0: a, 1: b, 2: 0xFFFE}),
1780 (u"ab", 3)
1781 )
1782
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001783
Georg Brandl8f99f812006-10-29 08:39:22 +00001784class WithStmtTest(unittest.TestCase):
1785 def test_encodedfile(self):
1786 f = StringIO.StringIO("\xc3\xbc")
1787 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001788 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001789
1790 def test_streamreaderwriter(self):
1791 f = StringIO.StringIO("\xc3\xbc")
1792 info = codecs.lookup("utf-8")
1793 with codecs.StreamReaderWriter(f, info.streamreader,
1794 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001795 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001796
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001797
Serhiy Storchakac8e58122013-01-29 10:20:34 +02001798class UnicodeEscapeTest(unittest.TestCase):
1799 def test_empty(self):
1800 self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
1801 self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
1802
1803 def test_raw_encode(self):
1804 encode = codecs.unicode_escape_encode
1805 for b in range(32, 127):
1806 if b != ord('\\'):
1807 self.assertEqual(encode(unichr(b)), (chr(b), 1))
1808
1809 def test_raw_decode(self):
1810 decode = codecs.unicode_escape_decode
1811 for b in range(256):
1812 if b != ord('\\'):
1813 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
1814
1815 def test_escape_encode(self):
1816 encode = codecs.unicode_escape_encode
1817 check = coding_checker(self, encode)
1818 check(u'\t', r'\t')
1819 check(u'\n', r'\n')
1820 check(u'\r', r'\r')
1821 check(u'\\', r'\\')
1822 for b in range(32):
1823 if chr(b) not in '\t\n\r':
1824 check(unichr(b), '\\x%02x' % b)
1825 for b in range(127, 256):
1826 check(unichr(b), '\\x%02x' % b)
1827 check(u'\u20ac', r'\u20ac')
1828 check(u'\U0001d120', r'\U0001d120')
1829
1830 def test_escape_decode(self):
1831 decode = codecs.unicode_escape_decode
1832 check = coding_checker(self, decode)
1833 check("[\\\n]", u"[]")
1834 check(r'[\"]', u'["]')
1835 check(r"[\']", u"[']")
1836 check(r"[\\]", ur"[\]")
1837 check(r"[\a]", u"[\x07]")
1838 check(r"[\b]", u"[\x08]")
1839 check(r"[\t]", u"[\x09]")
1840 check(r"[\n]", u"[\x0a]")
1841 check(r"[\v]", u"[\x0b]")
1842 check(r"[\f]", u"[\x0c]")
1843 check(r"[\r]", u"[\x0d]")
1844 check(r"[\7]", u"[\x07]")
1845 check(r"[\8]", ur"[\8]")
1846 check(r"[\78]", u"[\x078]")
1847 check(r"[\41]", u"[!]")
1848 check(r"[\418]", u"[!8]")
1849 check(r"[\101]", u"[A]")
1850 check(r"[\1010]", u"[A0]")
1851 check(r"[\x41]", u"[A]")
1852 check(r"[\x410]", u"[A0]")
1853 check(r"\u20ac", u"\u20ac")
1854 check(r"\U0001d120", u"\U0001d120")
1855 for b in range(256):
1856 if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
1857 check('\\' + chr(b), u'\\' + unichr(b))
1858
1859 def test_decode_errors(self):
1860 decode = codecs.unicode_escape_decode
1861 for c, d in ('x', 2), ('u', 4), ('U', 4):
1862 for i in range(d):
1863 self.assertRaises(UnicodeDecodeError, decode,
1864 "\\" + c + "0"*i)
1865 self.assertRaises(UnicodeDecodeError, decode,
1866 "[\\" + c + "0"*i + "]")
1867 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
1868 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
1869 self.assertEqual(decode(data, "replace"),
1870 (u"[\ufffd]\ufffd", len(data)))
1871 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
1872 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
1873 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
1874
1875
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02001876class RawUnicodeEscapeTest(unittest.TestCase):
1877 def test_empty(self):
1878 self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
1879 self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))
1880
1881 def test_raw_encode(self):
1882 encode = codecs.raw_unicode_escape_encode
1883 for b in range(256):
1884 self.assertEqual(encode(unichr(b)), (chr(b), 1))
1885
1886 def test_raw_decode(self):
1887 decode = codecs.raw_unicode_escape_decode
1888 for b in range(256):
1889 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
1890
1891 def test_escape_encode(self):
1892 encode = codecs.raw_unicode_escape_encode
1893 check = coding_checker(self, encode)
1894 for b in range(256):
1895 if chr(b) not in 'uU':
1896 check(u'\\' + unichr(b), '\\' + chr(b))
1897 check(u'\u20ac', r'\u20ac')
1898 check(u'\U0001d120', r'\U0001d120')
1899
1900 def test_escape_decode(self):
1901 decode = codecs.raw_unicode_escape_decode
1902 check = coding_checker(self, decode)
1903 for b in range(256):
1904 if chr(b) not in 'uU':
1905 check('\\' + chr(b), u'\\' + unichr(b))
1906 check(r"\u20ac", u"\u20ac")
1907 check(r"\U0001d120", u"\U0001d120")
1908
1909 def test_decode_errors(self):
1910 decode = codecs.raw_unicode_escape_decode
1911 for c, d in ('u', 4), ('U', 4):
1912 for i in range(d):
1913 self.assertRaises(UnicodeDecodeError, decode,
1914 "\\" + c + "0"*i)
1915 self.assertRaises(UnicodeDecodeError, decode,
1916 "[\\" + c + "0"*i + "]")
1917 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
1918 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
1919 self.assertEqual(decode(data, "replace"),
1920 (u"[\ufffd]\ufffd", len(data)))
1921 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
1922 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
1923 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
1924
1925
Victor Stinner262be5e2010-05-22 02:11:07 +00001926class BomTest(unittest.TestCase):
1927 def test_seek0(self):
Victor Stinner7df55da2010-05-22 13:37:56 +00001928 data = u"1234567890"
Victor Stinner262be5e2010-05-22 02:11:07 +00001929 tests = ("utf-16",
1930 "utf-16-le",
1931 "utf-16-be",
1932 "utf-32",
1933 "utf-32-le",
1934 "utf-32-be")
Victor Stinner6c603c42011-05-23 16:19:31 +02001935 self.addCleanup(test_support.unlink, test_support.TESTFN)
Victor Stinner262be5e2010-05-22 02:11:07 +00001936 for encoding in tests:
Victor Stinner7df55da2010-05-22 13:37:56 +00001937 # Check if the BOM is written only once
1938 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
Victor Stinner262be5e2010-05-22 02:11:07 +00001939 f.write(data)
1940 f.write(data)
1941 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00001942 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00001943 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00001944 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00001945
Victor Stinner7df55da2010-05-22 13:37:56 +00001946 # Check that the BOM is written after a seek(0)
1947 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
1948 f.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00001949 self.assertNotEqual(f.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00001950 f.seek(0)
1951 f.write(data)
1952 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00001953 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00001954
1955 # (StreamWriter) Check that the BOM is written after a seek(0)
1956 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
1957 f.writer.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00001958 self.assertNotEqual(f.writer.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00001959 f.writer.seek(0)
1960 f.writer.write(data)
1961 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00001962 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00001963
1964 # Check that the BOM is not written after a seek() at a position
1965 # different than the start
1966 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
1967 f.write(data)
1968 f.seek(f.tell())
1969 f.write(data)
1970 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00001971 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00001972
1973 # (StreamWriter) Check that the BOM is not written after a seek()
1974 # at a position different than the start
1975 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
1976 f.writer.write(data)
1977 f.writer.seek(f.writer.tell())
1978 f.writer.write(data)
1979 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00001980 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00001981
Victor Stinner262be5e2010-05-22 02:11:07 +00001982
def test_main():
    """Hand the test case classes listed below to test_support.run_unittest."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
    )
    test_support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00002015
2016
# Call test_main() when this file is executed as a script.
if __name__ == "__main__":
    test_main()