# blob: 2359917769c69958e6877533607eec45ba11d97e
from test import test_support
import unittest
import codecs
import locale
import sys, StringIO, _testcapi

def coding_checker(self, coder):
    """Build an assertion helper for a codec function.

    *self* is the test case whose ``assertEqual`` is used and *coder* is the
    encode/decode callable under test.  The returned ``check(input, expect)``
    asserts that ``coder(input)`` equals ``(expect, len(input))``, i.e. that
    the expected output was produced and the whole input was consumed.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # New data is appended at the tail of the buffer.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains everything; otherwise at most *size*
        # items are removed from the head and returned.
        if size < 0:
            data, self._buffer = self._buffer, ""
        else:
            data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data

Walter Dörwalde57d7b12004-12-21 22:24:00 +000032class ReadTest(unittest.TestCase):
33 def check_partial(self, input, partialresults):
Walter Dörwald69652032004-09-07 20:24:22 +000034 # get a StreamReader for the encoding and feed the bytestring version
Walter Dörwaldfc7e72d2007-11-19 12:14:05 +000035 # of input to the reader byte by byte. Read everything available from
Walter Dörwald69652032004-09-07 20:24:22 +000036 # the StreamReader and check that the results equal the appropriate
37 # entries from partialresults.
38 q = Queue()
Walter Dörwalde57d7b12004-12-21 22:24:00 +000039 r = codecs.getreader(self.encoding)(q)
Walter Dörwald69652032004-09-07 20:24:22 +000040 result = u""
Walter Dörwalde57d7b12004-12-21 22:24:00 +000041 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
Walter Dörwald69652032004-09-07 20:24:22 +000042 q.write(c)
43 result += r.read()
44 self.assertEqual(result, partialresult)
45 # check that there's nothing left in the buffers
46 self.assertEqual(r.read(), u"")
47 self.assertEqual(r.bytebuffer, "")
48 self.assertEqual(r.charbuffer, u"")
49
Walter Dörwaldabb02e52006-03-15 11:35:15 +000050 # do the check again, this time using a incremental decoder
51 d = codecs.getincrementaldecoder(self.encoding)()
52 result = u""
53 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
54 result += d.decode(c)
55 self.assertEqual(result, partialresult)
56 # check that there's nothing left in the buffers
57 self.assertEqual(d.decode("", True), u"")
58 self.assertEqual(d.buffer, "")
59
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +000060 # Check whether the reset method works properly
Walter Dörwaldabb02e52006-03-15 11:35:15 +000061 d.reset()
62 result = u""
63 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
64 result += d.decode(c)
65 self.assertEqual(result, partialresult)
66 # check that there's nothing left in the buffers
67 self.assertEqual(d.decode("", True), u"")
68 self.assertEqual(d.buffer, "")
69
70 # check iterdecode()
71 encoded = input.encode(self.encoding)
72 self.assertEqual(
73 input,
74 u"".join(codecs.iterdecode(encoded, self.encoding))
75 )
76
Walter Dörwalde57d7b12004-12-21 22:24:00 +000077 def test_readline(self):
78 def getreader(input):
79 stream = StringIO.StringIO(input.encode(self.encoding))
80 return codecs.getreader(self.encoding)(stream)
81
Walter Dörwaldca199432006-03-06 22:39:12 +000082 def readalllines(input, keepends=True, size=None):
Walter Dörwalde57d7b12004-12-21 22:24:00 +000083 reader = getreader(input)
84 lines = []
85 while True:
Walter Dörwaldca199432006-03-06 22:39:12 +000086 line = reader.readline(size=size, keepends=keepends)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000087 if not line:
88 break
89 lines.append(line)
Walter Dörwaldca199432006-03-06 22:39:12 +000090 return "|".join(lines)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000091
92 s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
Walter Dörwaldca199432006-03-06 22:39:12 +000093 sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
94 sexpectednoends = u"foo|bar|baz|spam|eggs"
95 self.assertEqual(readalllines(s, True), sexpected)
96 self.assertEqual(readalllines(s, False), sexpectednoends)
97 self.assertEqual(readalllines(s, True, 10), sexpected)
98 self.assertEqual(readalllines(s, False, 10), sexpectednoends)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000099
100 # Test long lines (multiple calls to read() in readline())
101 vw = []
102 vwo = []
103 for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
104 vw.append((i*200)*u"\3042" + lineend)
105 vwo.append((i*200)*u"\3042")
106 self.assertEqual(readalllines("".join(vw), True), "".join(vw))
107 self.assertEqual(readalllines("".join(vw), False),"".join(vwo))
108
109 # Test lines where the first read might end with \r, so the
110 # reader has to look ahead whether this is a lone \r or a \r\n
111 for size in xrange(80):
112 for lineend in u"\n \r\n \r \u2028".split():
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000113 s = 10*(size*u"a" + lineend + u"xxx\n")
114 reader = getreader(s)
115 for i in xrange(10):
116 self.assertEqual(
117 reader.readline(keepends=True),
118 size*u"a" + lineend,
119 )
120 reader = getreader(s)
121 for i in xrange(10):
122 self.assertEqual(
123 reader.readline(keepends=False),
124 size*u"a",
125 )
126
127 def test_bug1175396(self):
128 s = [
129 '<%!--===================================================\r\n',
130 ' BLOG index page: show recent articles,\r\n',
131 ' today\'s articles, or articles of a specific date.\r\n',
132 '========================================================--%>\r\n',
133 '<%@inputencoding="ISO-8859-1"%>\r\n',
134 '<%@pagetemplate=TEMPLATE.y%>\r\n',
135 '<%@import=import frog.util, frog%>\r\n',
136 '<%@import=import frog.objects%>\r\n',
137 '<%@import=from frog.storageerrors import StorageError%>\r\n',
138 '<%\r\n',
139 '\r\n',
140 'import logging\r\n',
141 'log=logging.getLogger("Snakelets.logger")\r\n',
142 '\r\n',
143 '\r\n',
144 'user=self.SessionCtx.user\r\n',
145 'storageEngine=self.SessionCtx.storageEngine\r\n',
146 '\r\n',
147 '\r\n',
148 'def readArticlesFromDate(date, count=None):\r\n',
149 ' entryids=storageEngine.listBlogEntries(date)\r\n',
150 ' entryids.reverse() # descending\r\n',
151 ' if count:\r\n',
152 ' entryids=entryids[:count]\r\n',
153 ' try:\r\n',
154 ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
155 ' except StorageError,x:\r\n',
156 ' log.error("Error loading articles: "+str(x))\r\n',
157 ' self.abort("cannot load articles")\r\n',
158 '\r\n',
159 'showdate=None\r\n',
160 '\r\n',
161 'arg=self.Request.getArg()\r\n',
162 'if arg=="today":\r\n',
163 ' #-------------------- TODAY\'S ARTICLES\r\n',
164 ' self.write("<h2>Today\'s articles</h2>")\r\n',
165 ' showdate = frog.util.isodatestr() \r\n',
166 ' entries = readArticlesFromDate(showdate)\r\n',
167 'elif arg=="active":\r\n',
168 ' #-------------------- ACTIVE ARTICLES redirect\r\n',
169 ' self.Yredirect("active.y")\r\n',
170 'elif arg=="login":\r\n',
171 ' #-------------------- LOGIN PAGE redirect\r\n',
172 ' self.Yredirect("login.y")\r\n',
173 'elif arg=="date":\r\n',
174 ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
175 ' showdate = self.Request.getParameter("date")\r\n',
176 ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
177 ' entries = readArticlesFromDate(showdate)\r\n',
178 'else:\r\n',
179 ' #-------------------- RECENT ARTICLES\r\n',
180 ' self.write("<h2>Recent articles</h2>")\r\n',
181 ' dates=storageEngine.listBlogEntryDates()\r\n',
182 ' if dates:\r\n',
183 ' entries=[]\r\n',
184 ' SHOWAMOUNT=10\r\n',
185 ' for showdate in dates:\r\n',
186 ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
187 ' if len(entries)>=SHOWAMOUNT:\r\n',
188 ' break\r\n',
189 ' \r\n',
190 ]
191 stream = StringIO.StringIO("".join(s).encode(self.encoding))
192 reader = codecs.getreader(self.encoding)(stream)
193 for (i, line) in enumerate(reader):
194 self.assertEqual(line, s[i])
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000195
196 def test_readlinequeue(self):
197 q = Queue()
198 writer = codecs.getwriter(self.encoding)(q)
199 reader = codecs.getreader(self.encoding)(q)
200
201 # No lineends
202 writer.write(u"foo\r")
203 self.assertEqual(reader.readline(keepends=False), u"foo")
204 writer.write(u"\nbar\r")
Walter Dörwald43148c82005-04-21 21:45:36 +0000205 self.assertEqual(reader.readline(keepends=False), u"")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000206 self.assertEqual(reader.readline(keepends=False), u"bar")
207 writer.write(u"baz")
208 self.assertEqual(reader.readline(keepends=False), u"baz")
209 self.assertEqual(reader.readline(keepends=False), u"")
210
211 # Lineends
212 writer.write(u"foo\r")
213 self.assertEqual(reader.readline(keepends=True), u"foo\r")
214 writer.write(u"\nbar\r")
Walter Dörwald43148c82005-04-21 21:45:36 +0000215 self.assertEqual(reader.readline(keepends=True), u"\n")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000216 self.assertEqual(reader.readline(keepends=True), u"bar\r")
217 writer.write(u"baz")
218 self.assertEqual(reader.readline(keepends=True), u"baz")
219 self.assertEqual(reader.readline(keepends=True), u"")
220 writer.write(u"foo\r\n")
221 self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
222
Walter Dörwald9fa09462005-01-10 12:01:39 +0000223 def test_bug1098990_a(self):
224 s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
225 s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
226 s3 = u"next line.\r\n"
227
228 s = (s1+s2+s3).encode(self.encoding)
229 stream = StringIO.StringIO(s)
230 reader = codecs.getreader(self.encoding)(stream)
231 self.assertEqual(reader.readline(), s1)
232 self.assertEqual(reader.readline(), s2)
233 self.assertEqual(reader.readline(), s3)
234 self.assertEqual(reader.readline(), u"")
235
236 def test_bug1098990_b(self):
237 s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
238 s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
239 s3 = u"stillokay:bbbbxx\r\n"
240 s4 = u"broken!!!!badbad\r\n"
241 s5 = u"againokay.\r\n"
242
243 s = (s1+s2+s3+s4+s5).encode(self.encoding)
244 stream = StringIO.StringIO(s)
245 reader = codecs.getreader(self.encoding)(stream)
246 self.assertEqual(reader.readline(), s1)
247 self.assertEqual(reader.readline(), s2)
248 self.assertEqual(reader.readline(), s3)
249 self.assertEqual(reader.readline(), s4)
250 self.assertEqual(reader.readline(), s5)
251 self.assertEqual(reader.readline(), u"")
252
class UTF32Test(ReadTest):
    """Tests for the BOM-detecting "utf-32" codec."""
    encoding = "utf-32"

    # u"spamspam" encoded with exactly one leading BOM, in either byte order
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest):
    """Tests for the little-endian "utf-32-le" codec (no BOM)."""
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest):
    """Tests for the big-endian "utf-32-be" codec (no BOM)."""
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest):
    """Tests for the BOM-detecting "utf-16" codec."""
    encoding = "utf-16"

    # u"spamspam" encoded with exactly one leading BOM, in either byte order
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest):
    """Tests for the little-endian "utf-16-le" codec (no BOM)."""
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with its 'replace' decoding.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

class UTF16BETest(ReadTest):
    """Tests for the big-endian "utf-16-be" codec (no BOM)."""
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with its 'replace' decoding.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # One expected result per encoded byte: a character only shows up
        # once its last byte has been fed.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

614class UTF16ExTest(unittest.TestCase):
615
616 def test_errors(self):
617 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
618
619 def test_bad_args(self):
620 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
621
622class ReadBufferTest(unittest.TestCase):
623
624 def test_array(self):
625 import array
626 self.assertEqual(
627 codecs.readbuffer_encode(array.array("c", "spam")),
628 ("spam", 4)
629 )
630
631 def test_empty(self):
632 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
633
634 def test_bad_args(self):
635 self.assertRaises(TypeError, codecs.readbuffer_encode)
636 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
637
638class CharBufferTest(unittest.TestCase):
639
640 def test_string(self):
641 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
642
643 def test_empty(self):
644 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
645
646 def test_bad_args(self):
647 self.assertRaises(TypeError, codecs.charbuffer_encode)
648 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
649
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM)."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Decode *bytestring* through a utf-8-sig StreamReader using a range
        # of read() size hints (None means an unbounded read) and check that
        # the concatenated chunks always equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must be stripped.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode like plain UTF-8.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

Walter Dörwald8709a422002-09-03 13:53:40 +0000734class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000735 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000736 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000737
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200738 def test_raw(self):
739 for b in ''.join(map(chr, range(256))):
740 if b != '\\':
741 self.assertEqual(codecs.escape_decode(b + '0'),
742 (b + '0', 2))
743
744 def test_escape(self):
745 self.assertEqual(codecs.escape_decode(b"[\\\n]"), (b"[]", 4))
746 self.assertEqual(codecs.escape_decode(br'[\"]'), (b'["]', 4))
747 self.assertEqual(codecs.escape_decode(br"[\']"), (b"[']", 4))
748 self.assertEqual(codecs.escape_decode(br"[\\]"), (br"[\]", 4))
749 self.assertEqual(codecs.escape_decode(br"[\a]"), (b"[\x07]", 4))
750 self.assertEqual(codecs.escape_decode(br"[\b]"), (b"[\x08]", 4))
751 self.assertEqual(codecs.escape_decode(br"[\t]"), (b"[\x09]", 4))
752 self.assertEqual(codecs.escape_decode(br"[\n]"), (b"[\x0a]", 4))
753 self.assertEqual(codecs.escape_decode(br"[\v]"), (b"[\x0b]", 4))
754 self.assertEqual(codecs.escape_decode(br"[\f]"), (b"[\x0c]", 4))
755 self.assertEqual(codecs.escape_decode(br"[\r]"), (b"[\x0d]", 4))
756 self.assertEqual(codecs.escape_decode(br"[\7]"), (b"[\x07]", 4))
757 self.assertEqual(codecs.escape_decode(br"[\8]"), (br"[\8]", 4))
758 self.assertEqual(codecs.escape_decode(br"[\78]"), (b"[\x078]", 5))
759 self.assertEqual(codecs.escape_decode(br"[\41]"), (b"[!]", 5))
760 self.assertEqual(codecs.escape_decode(br"[\418]"), (b"[!8]", 6))
761 self.assertEqual(codecs.escape_decode(br"[\101]"), (b"[A]", 6))
762 self.assertEqual(codecs.escape_decode(br"[\1010]"), (b"[A0]", 7))
763 self.assertEqual(codecs.escape_decode(br"[\501]"), (b"[A]", 6))
764 self.assertEqual(codecs.escape_decode(br"[\x41]"), (b"[A]", 6))
765 self.assertEqual(codecs.escape_decode(br"[\X41]"), (br"[\X41]", 6))
766 self.assertEqual(codecs.escape_decode(br"[\x410]"), (b"[A0]", 7))
767 for b in ''.join(map(chr, range(256))):
768 if b not in '\n"\'\\abtnvfr01234567x':
769 self.assertEqual(codecs.escape_decode('\\' + b),
770 ('\\' + b, 2))
771
772 def test_errors(self):
773 self.assertRaises(ValueError, codecs.escape_decode, br"\x")
774 self.assertRaises(ValueError, codecs.escape_decode, br"[\x]")
775 self.assertEqual(codecs.escape_decode(br"[\x]\x", "ignore"), (b"[]", 6))
776 self.assertEqual(codecs.escape_decode(br"[\x]\x", "replace"), (b"[?]?", 6))
777 self.assertRaises(ValueError, codecs.escape_decode, br"\x0")
778 self.assertRaises(ValueError, codecs.escape_decode, br"[\x0]")
779 self.assertEqual(codecs.escape_decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
780 self.assertEqual(codecs.escape_decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
781
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000782class RecodingTest(unittest.TestCase):
783 def test_recoding(self):
784 f = StringIO.StringIO()
785 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
786 f2.write(u"a")
787 f2.close()
788 # Python used to crash on this at exit because of a refcount
789 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000790
# From RFC 3492
#
# Each entry is a (unicode_text, punycode_bytes) pair.  PunycodeTest
# encodes the first element and decodes the second, comparing against
# the other half of the pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
894
# Import-time sanity check: print any table entry that is not a pair.
for i in punycode_testcases:
    if not len(i) == 2:
        print(repr(i))
898
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        # Both sides are lower-cased before comparing: some of the
        # extended reference encodings use upper case while our encoder
        # only produces lower case, and lowering just the expected value
        # is not enough because some input characters are upper case too.
        for text, expected in punycode_testcases:
            produced = text.encode("punycode")
            self.assertEqual(produced.lower(), expected.lower())

    def test_decode(self):
        # Decoding must give back the exact original text.
        for text, encoded in punycode_testcases:
            decoded = encoded.decode("punycode")
            self.assertEqual(text, decoded)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000912
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000913class UnicodeInternalTest(unittest.TestCase):
914 def test_bug1251300(self):
915 # Decoding with unicode_internal used to not correctly handle "code
916 # points" above 0x10ffff on UCS-4 builds.
917 if sys.maxunicode > 0xffff:
918 ok = [
919 ("\x00\x10\xff\xff", u"\U0010ffff"),
920 ("\x00\x00\x01\x01", u"\U00000101"),
921 ("", u""),
922 ]
923 not_ok = [
924 "\x7f\xff\xff\xff",
925 "\x80\x00\x00\x00",
926 "\x81\x00\x00\x00",
927 "\x00",
928 "\x00\x00\x00\x00\x00",
929 ]
930 for internal, uni in ok:
931 if sys.byteorder == "little":
932 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +0000933 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000934 for internal in not_ok:
935 if sys.byteorder == "little":
936 internal = "".join(reversed(internal))
937 self.assertRaises(UnicodeDecodeError, internal.decode,
938 "unicode_internal")
939
940 def test_decode_error_attributes(self):
941 if sys.maxunicode > 0xffff:
942 try:
943 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
944 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +0000945 self.assertEqual("unicode_internal", ex.encoding)
946 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
947 self.assertEqual(4, ex.start)
948 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000949 else:
950 self.fail()
951
952 def test_decode_callback(self):
953 if sys.maxunicode > 0xffff:
954 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
955 decoder = codecs.getdecoder("unicode_internal")
956 ab = u"ab".encode("unicode_internal")
957 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
958 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +0000959 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000960
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000961 def test_encode_length(self):
962 # Issue 3739
963 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +0000964 self.assertEqual(encoder(u"a")[1], 1)
965 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000966
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000967 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +0000968 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +0000969
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both given as UTF-8 encoded byte
# strings.  An expected value of None means nameprep must reject the
# input with UnicodeError.  (None, None) marks a vector that is skipped;
# it stays in the list so that list position + 1 still matches the
# "3.N" numbering used in failure messages by NameprepTest.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1122
1123
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        # Run every vector in nameprep_tests through
        # encodings.idna.nameprep.  Each failure message carries the
        # vector number ("Test 3.N", N = list position + 1) and is
        # reported as a regular test failure, so the original traceback
        # is preserved.
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            label = "Test 3.%d" % (pos + 1)
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                try:
                    result = nameprep(orig)
                except UnicodeError:
                    continue
                self.fail("%s: UnicodeError not raised, got %r"
                          % (label, result))
            prepped = unicode(prepped, "utf-8")
            try:
                result = nameprep(orig)
            except UnicodeError as e:
                # The input should have been accepted.
                self.fail("%s: %s" % (label, e))
            self.assertEqual(result, prepped,
                             "%s: %r != %r" % (label, result, prepped))
1142
Walter Dörwald78a0be62006-04-14 18:25:39 +00001143class IDNACodecTest(unittest.TestCase):
1144 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001145 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1146 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1147 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1148 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001149
1150 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001151 self.assertEqual(u"python.org".encode("idna"), "python.org")
1152 self.assertEqual("python.org.".encode("idna"), "python.org.")
1153 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1154 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001155
Martin v. Löwis8b595142005-08-25 11:03:38 +00001156 def test_stream(self):
1157 import StringIO
1158 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1159 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001160 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001161
Walter Dörwald78a0be62006-04-14 18:25:39 +00001162 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001163 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001164 "".join(codecs.iterdecode("python.org", "idna")),
1165 u"python.org"
1166 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001167 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001168 "".join(codecs.iterdecode("python.org.", "idna")),
1169 u"python.org."
1170 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001171 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001172 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1173 u"pyth\xf6n.org."
1174 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001175 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001176 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1177 u"pyth\xf6n.org."
1178 )
1179
1180 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001181 self.assertEqual(decoder.decode("xn--xam", ), u"")
1182 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1183 self.assertEqual(decoder.decode(u"rg"), u"")
1184 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001185
1186 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001187 self.assertEqual(decoder.decode("xn--xam", ), u"")
1188 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1189 self.assertEqual(decoder.decode("rg."), u"org.")
1190 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001191
1192 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001193 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001194 "".join(codecs.iterencode(u"python.org", "idna")),
1195 "python.org"
1196 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001197 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001198 "".join(codecs.iterencode(u"python.org.", "idna")),
1199 "python.org."
1200 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001201 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001202 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1203 "xn--pythn-mua.org."
1204 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001205 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001206 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1207 "xn--pythn-mua.org."
1208 )
1209
1210 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001211 self.assertEqual(encoder.encode(u"\xe4x"), "")
1212 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1213 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001214
1215 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001216 self.assertEqual(encoder.encode(u"\xe4x"), "")
1217 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1218 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001219
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001220class CodecsModuleTest(unittest.TestCase):
1221
1222 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001223 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001224 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001225 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001226 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001227 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1228
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001229 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001230 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001231 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001232 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001233 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001234 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001235 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1236
1237 def test_register(self):
1238 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001239 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001240
1241 def test_lookup(self):
1242 self.assertRaises(TypeError, codecs.lookup)
1243 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001244 self.assertRaises(LookupError, codecs.lookup, " ")
1245
1246 def test_getencoder(self):
1247 self.assertRaises(TypeError, codecs.getencoder)
1248 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1249
1250 def test_getdecoder(self):
1251 self.assertRaises(TypeError, codecs.getdecoder)
1252 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1253
1254 def test_getreader(self):
1255 self.assertRaises(TypeError, codecs.getreader)
1256 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1257
1258 def test_getwriter(self):
1259 self.assertRaises(TypeError, codecs.getwriter)
1260 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001261
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001262 def test_lookup_issue1813(self):
1263 # Issue #1813: under Turkish locales, lookup of some codecs failed
1264 # because 'I' is lowercased as a dotless "i"
1265 oldlocale = locale.getlocale(locale.LC_CTYPE)
1266 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1267 try:
1268 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1269 except locale.Error:
1270 # Unsupported locale on this system
1271 self.skipTest('test needs Turkish locale')
1272 c = codecs.lookup('ASCII')
1273 self.assertEqual(c.name, 'ascii')
1274
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001275class StreamReaderTest(unittest.TestCase):
1276
1277 def setUp(self):
1278 self.reader = codecs.getreader('utf-8')
1279 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1280
1281 def test_readlines(self):
1282 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001283 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001284
Georg Brandl8f99f812006-10-29 08:39:22 +00001285class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001286
Georg Brandl8f99f812006-10-29 08:39:22 +00001287 def test_basic(self):
1288 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001289 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001290 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001291
1292 f = StringIO.StringIO()
1293 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1294 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001295 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001296
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001297class Str2StrTest(unittest.TestCase):
1298
1299 def test_read(self):
1300 sin = "\x80".encode("base64_codec")
1301 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1302 sout = reader.read()
1303 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001304 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001305
1306 def test_readline(self):
1307 sin = "\x80".encode("base64_codec")
1308 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1309 sout = reader.readline()
1310 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001311 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001312
# Encodings exercised by BasicUnicodeTest.  "mbcs", "bz2_codec" and
# "zlib_codec" are appended further down, only when available.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build's codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Copied *before* bz2_codec/zlib_codec are appended below, so those two
# are excluded from the stream tests only, not from the incremental
# coder tests.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# The compression codecs are only tested when their module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1463
class BasicUnicodeTest(unittest.TestCase):
    # Generic round-trip and argument checks run against every encoding
    # listed in all_unicode_encodings.

    def test_basics(self):
        # Round-trip a short ASCII string through each codec's stateless,
        # stream and incremental interfaces.
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the list entry,
            # ignoring "_" vs "-".  The "_codec" suffix is re-added and
            # latin_1 is special-cased before comparing.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # stateless encoder/decoder; the reported size is the amount
            # of input consumed
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # (data is pushed through one item at a time)
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final call flushes whatever the encoder still holds
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # Re-reading the first line after seek(0, 0) must give the same
        # text every time.
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all but
        # idna and punycode are also checked to reject an int.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # The comparison below is intentionally trivial.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1586
class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        # Round-trip a plain ASCII str through every str-only codec.
        text = "abc123"
        for encoding in all_string_encodings:
            encode = codecs.getencoder(encoding)
            encoded, consumed = encode(text)
            self.assertEqual(consumed, len(text))
            decode = codecs.getdecoder(encoding)
            decoded, consumed = decode(encoded)
            self.assertEqual(decoded, text, "%r != %r (encoding=%r)" % (decoded, text, encoding))
1595
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with each supported kind of map.

    Every test decodes the same three input bytes (0, 1 and 2) and checks
    the result for a complete map, for a map without an entry for byte 2
    and for a map that sends byte 2 to U+FFFE, under the "strict",
    "replace" and "ignore" error handlers.
    """

    # Input shared by all tests.  An explicit bytes literal is used
    # everywhere so that the input never depends on what "" means.
    DATA = b"\x00\x01\x02"
    # Every byte value exactly once, built in a way that yields a byte
    # string regardless of the meaning of chr().
    ALLBYTES = bytes(bytearray(range(256)))

    def test_decode_with_string_map(self):
        """Decode with a unicode string, indexed by byte value, as the map."""
        decode = codecs.charmap_decode
        data = self.DATA

        self.assertEqual(decode(data, "strict", u"abc"), (u"abc", 3))

        # Byte 2 is not covered by the map, or is mapped to U+FFFE.
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict", u"ab")
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict", u"ab\ufffe")

        self.assertEqual(decode(data, "replace", u"ab"),
                         (u"ab\ufffd", 3))
        self.assertEqual(decode(data, "replace", u"ab\ufffe"),
                         (u"ab\ufffd", 3))

        self.assertEqual(decode(data, "ignore", u"ab"), (u"ab", 3))
        self.assertEqual(decode(data, "ignore", u"ab\ufffe"), (u"ab", 3))

        # An empty map together with "ignore" drops every byte.
        self.assertEqual(decode(self.ALLBYTES, "ignore", u""),
                         (u"", len(self.ALLBYTES)))

    def test_decode_with_int2str_map(self):
        """Decode with a dict that maps byte values to unicode strings."""
        decode = codecs.charmap_decode
        data = self.DATA

        self.assertEqual(
            decode(data, "strict", {0: u'a', 1: u'b', 2: u'c'}),
            (u"abc", 3))
        # A byte may map to more than one character ...
        self.assertEqual(
            decode(data, "strict", {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
            (u"AaBbCc", 3))
        # ... to the highest code point ...
        self.assertEqual(
            decode(data, "strict", {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
            (u"\U0010FFFFbc", 3))
        # ... or to the empty string.
        self.assertEqual(
            decode(data, "strict", {0: u'a', 1: u'b', 2: u''}),
            (u"ab", 3))

        # A missing key and a None value are both errors in strict mode.
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict", {0: u'a', 1: u'b'})
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict",
                          {0: u'a', 1: u'b', 2: None})
        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict",
                          {0: u'a', 1: u'b', 2: u'\ufffe'})

        self.assertEqual(
            decode(data, "replace", {0: u'a', 1: u'b'}),
            (u"ab\ufffd", 3))
        self.assertEqual(
            decode(data, "replace", {0: u'a', 1: u'b', 2: None}),
            (u"ab\ufffd", 3))
        # Issue #14850
        self.assertEqual(
            decode(data, "replace", {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab\ufffd", 3))

        self.assertEqual(
            decode(data, "ignore", {0: u'a', 1: u'b'}),
            (u"ab", 3))
        self.assertEqual(
            decode(data, "ignore", {0: u'a', 1: u'b', 2: None}),
            (u"ab", 3))
        # Issue #14850
        self.assertEqual(
            decode(data, "ignore", {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab", 3))

        # An empty map together with "ignore" drops every byte.
        self.assertEqual(decode(self.ALLBYTES, "ignore", {}),
                         (u"", len(self.ALLBYTES)))

    def test_decode_with_int2int_map(self):
        """Decode with a dict that maps byte values to code points."""
        decode = codecs.charmap_decode
        data = self.DATA
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')

        self.assertEqual(
            decode(data, "strict", {0: a, 1: b, 2: c}),
            (u"abc", 3))
        # Issue #15379
        self.assertEqual(
            decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
            (u"\U0010FFFFbc", 3))
        # One past the highest code point is rejected as a bad map value.
        self.assertRaises(TypeError,
                          decode, data, "strict",
                          {0: 0x110000, 1: b, 2: c})

        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict", {0: a, 1: b})
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict",
                          {0: a, 1: b, 2: 0xFFFE})

        self.assertEqual(
            decode(data, "replace", {0: a, 1: b}),
            (u"ab\ufffd", 3))
        self.assertEqual(
            decode(data, "replace", {0: a, 1: b, 2: 0xFFFE}),
            (u"ab\ufffd", 3))

        self.assertEqual(
            decode(data, "ignore", {0: a, 1: b}),
            (u"ab", 3))
        self.assertEqual(
            decode(data, "ignore", {0: a, 1: b, 2: 0xFFFE}),
            (u"ab", 3))
1778
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001779
Georg Brandl8f99f812006-10-29 08:39:22 +00001780class WithStmtTest(unittest.TestCase):
1781 def test_encodedfile(self):
1782 f = StringIO.StringIO("\xc3\xbc")
1783 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001784 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001785
1786 def test_streamreaderwriter(self):
1787 f = StringIO.StringIO("\xc3\xbc")
1788 info = codecs.lookup("utf-8")
1789 with codecs.StreamReaderWriter(f, info.streamreader,
1790 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001791 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001792
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001793
Serhiy Storchakac8e58122013-01-29 10:20:34 +02001794class UnicodeEscapeTest(unittest.TestCase):
1795 def test_empty(self):
1796 self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
1797 self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
1798
1799 def test_raw_encode(self):
1800 encode = codecs.unicode_escape_encode
1801 for b in range(32, 127):
1802 if b != ord('\\'):
1803 self.assertEqual(encode(unichr(b)), (chr(b), 1))
1804
1805 def test_raw_decode(self):
1806 decode = codecs.unicode_escape_decode
1807 for b in range(256):
1808 if b != ord('\\'):
1809 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
1810
1811 def test_escape_encode(self):
1812 encode = codecs.unicode_escape_encode
1813 check = coding_checker(self, encode)
1814 check(u'\t', r'\t')
1815 check(u'\n', r'\n')
1816 check(u'\r', r'\r')
1817 check(u'\\', r'\\')
1818 for b in range(32):
1819 if chr(b) not in '\t\n\r':
1820 check(unichr(b), '\\x%02x' % b)
1821 for b in range(127, 256):
1822 check(unichr(b), '\\x%02x' % b)
1823 check(u'\u20ac', r'\u20ac')
1824 check(u'\U0001d120', r'\U0001d120')
1825
1826 def test_escape_decode(self):
1827 decode = codecs.unicode_escape_decode
1828 check = coding_checker(self, decode)
1829 check("[\\\n]", u"[]")
1830 check(r'[\"]', u'["]')
1831 check(r"[\']", u"[']")
1832 check(r"[\\]", ur"[\]")
1833 check(r"[\a]", u"[\x07]")
1834 check(r"[\b]", u"[\x08]")
1835 check(r"[\t]", u"[\x09]")
1836 check(r"[\n]", u"[\x0a]")
1837 check(r"[\v]", u"[\x0b]")
1838 check(r"[\f]", u"[\x0c]")
1839 check(r"[\r]", u"[\x0d]")
1840 check(r"[\7]", u"[\x07]")
1841 check(r"[\8]", ur"[\8]")
1842 check(r"[\78]", u"[\x078]")
1843 check(r"[\41]", u"[!]")
1844 check(r"[\418]", u"[!8]")
1845 check(r"[\101]", u"[A]")
1846 check(r"[\1010]", u"[A0]")
1847 check(r"[\x41]", u"[A]")
1848 check(r"[\x410]", u"[A0]")
1849 check(r"\u20ac", u"\u20ac")
1850 check(r"\U0001d120", u"\U0001d120")
1851 for b in range(256):
1852 if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
1853 check('\\' + chr(b), u'\\' + unichr(b))
1854
1855 def test_decode_errors(self):
1856 decode = codecs.unicode_escape_decode
1857 for c, d in ('x', 2), ('u', 4), ('U', 4):
1858 for i in range(d):
1859 self.assertRaises(UnicodeDecodeError, decode,
1860 "\\" + c + "0"*i)
1861 self.assertRaises(UnicodeDecodeError, decode,
1862 "[\\" + c + "0"*i + "]")
1863 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
1864 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
1865 self.assertEqual(decode(data, "replace"),
1866 (u"[\ufffd]\ufffd", len(data)))
1867 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
1868 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
1869 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
1870
1871
class BomTest(unittest.TestCase):
    """BOM handling of the UTF-16 and UTF-32 codecs around seek()."""

    def test_seek0(self):
        """Write and re-read a file in each encoding, seeking in between."""
        text = u"1234567890"
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for name in encodings:
            def reopen():
                # Every scenario below starts from a freshly truncated file.
                return codecs.open(test_support.TESTFN, 'w+', encoding=name)

            # Check if the BOM is written only once
            with reopen() as fp:
                fp.write(text)
                fp.write(text)
                fp.seek(0)
                self.assertEqual(fp.read(), text * 2)
                fp.seek(0)
                self.assertEqual(fp.read(), text * 2)

            # Check that the BOM is written after a seek(0)
            with reopen() as fp:
                fp.write(text[0])
                self.assertNotEqual(fp.tell(), 0)
                fp.seek(0)
                fp.write(text)
                fp.seek(0)
                self.assertEqual(fp.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen() as fp:
                writer = fp.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                fp.seek(0)
                self.assertEqual(fp.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen() as fp:
                fp.write(text)
                fp.seek(fp.tell())
                fp.write(text)
                fp.seek(0)
                self.assertEqual(fp.read(), text * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen() as fp:
                writer = fp.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                fp.seek(0)
                self.assertEqual(fp.read(), text * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00001927
Victor Stinner262be5e2010-05-22 02:11:07 +00001928
def test_main():
    """Run every test case class of this module via test_support."""
    cases = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        BomTest,
    )
    test_support.run_unittest(*cases)
Fred Drake2e2be372001-09-20 21:33:42 +00001960
1961
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()