blob: c9a25154aef94bec5cdc9a086b0e4236a0c2a1ca [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Walter Dörwald9ae019b2006-03-18 14:22:26 +00005import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
def coding_checker(self, coder):
    """Build an assertion helper bound to a test case and a codec function.

    ``self`` is the object whose ``assertEqual`` is used and ``coder`` is
    called with a single argument.  The returned ``check(input, expect)``
    asserts that ``coder(input)`` equals ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(actual, wanted)
    return check
11
class Queue(object):
    """
    FIFO buffer: write() appends at the tail, read() consumes from the head
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        """Append ``chars`` to the end of the buffered data."""
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        """Remove and return buffered data.

        A non-negative ``size`` returns at most that many leading items;
        otherwise the whole buffer is returned and emptied.
        """
        if size >= 0:
            head, self._buffer = self._buffer[:size], self._buffer[size:]
            return head
        drained, self._buffer = self._buffer, ""
        return drained
31
class ReadTest(unittest.TestCase):
    """Codec-independent checks for stream readers and incremental decoders.

    Every method reads ``self.encoding``; the subclasses in this file set
    it as a class attribute.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded form of ``input`` one byte at a time.

        ``partialresults`` lists, for each byte fed, the text that must
        have been decoded so far.  The check is run with a StreamReader,
        with an incremental decoder, again with the same decoder after
        ``reset()``, and finally with ``codecs.iterdecode()``.
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        """readline() must split on every line end, with and without keepends."""
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty result and join the
            # lines with "|" so that the line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out explicitly.  Building them with
        # u"\n \r\n \r \u2028".split() yields an empty list, because every
        # one of these characters is whitespace to unicode.split(), and
        # the loops below would then silently check nothing.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; an empty line would make
            # readalllines() stop early when keepends is false.
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        # readalllines() joins with "|", so the expected values must too.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # consume the "xxx" line that follows each tested line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        """Iterating a reader must yield exactly the lines that were written."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that is written to between the reads."""
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        """Three \\r\\n-terminated lines are read back unchanged, then u""."""
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        """Five \\r\\n-terminated lines are read back unchanged, then u""."""
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
252
class UTF32Test(ReadTest):
    """ReadTest checks plus BOM and error handling for the "utf-32" codec."""
    encoding = "utf-32"

    # The two byte strings accepted by test_only_one_bom as the encoded
    # form of u"spam" written twice: one BOM followed by eight characters.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        # encode some stream
        sink = StringIO.StringIO()
        out = codecs.getwriter(self.encoding)(sink)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # try to read it back
        source = codecs.getreader(self.encoding)(StringIO.StringIO(encoded))
        self.assertEqual(source.read(), u"spamspam")

    def test_badbom(self):
        """Input starting with four 0xff bytes is rejected with UnicodeError."""
        for count in (4, 8):
            stream = StringIO.StringIO(count*"\xff")
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: four BOM bytes, then four bytes per character."""
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            4 * [u""] +     # BOM bytes; byteorder known after the fourth
            3 * [u""] +
            4 * [u"\x00"] +
            4 * [u"\x00\xff"] +
            4 * [u"\x00\xff\u0100"] +
            4 * [u"\x00\xff\u0100\uffff"] +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        """A lone byte is replaced or dropped depending on the error handler."""
        for handler, decoded in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode('\x01', handler, True))

    def test_errors(self):
        """A lone byte with final=True raises UnicodeDecodeError in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = u'\U00010000' * 1024
        samples = (
            '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024,   # little endian
            '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024,   # big endian
        )
        for encoded in samples:
            self.assertEqual(expected, codecs.utf_32_decode(encoded)[0])
337
class UTF32LETest(ReadTest):
    """ReadTest checks plus codec-specific cases for "utf-32-le"."""
    encoding = "utf-32-le"

    def test_partial(self):
        """Byte-by-byte decoding: a character appears on every fourth byte."""
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            3 * [u""] +
            4 * [u"\x00"] +
            4 * [u"\x00\xff"] +
            4 * [u"\x00\xff\u0100"] +
            4 * [u"\x00\xff\u0100\uffff"] +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )

    def test_simple(self):
        """U+10203 encodes to its four bytes, least significant first."""
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        """A lone byte with final=True raises UnicodeDecodeError in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = '\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * repeat, decoded)
381
class UTF32BETest(ReadTest):
    """ReadTest checks plus codec-specific cases for "utf-32-be"."""
    encoding = "utf-32-be"

    def test_partial(self):
        """Byte-by-byte decoding: a character appears on every fourth byte."""
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            3 * [u""] +
            4 * [u"\x00"] +
            4 * [u"\x00\xff"] +
            4 * [u"\x00\xff\u0100"] +
            4 * [u"\x00\xff\u0100\uffff"] +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )

    def test_simple(self):
        """U+10203 encodes to its four bytes, most significant first."""
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        """A lone byte with final=True raises UnicodeDecodeError in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = '\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual(u'\U00010000' * repeat, decoded)
425
426
class UTF16Test(ReadTest):
    """ReadTest checks plus BOM, error and file handling for "utf-16"."""
    encoding = "utf-16"

    # The two byte strings accepted by test_only_one_bom as the encoded
    # form of u"spam" written twice: one BOM followed by eight characters.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        # encode some stream
        sink = StringIO.StringIO()
        out = codecs.getwriter(self.encoding)(sink)
        for chunk in (u"spam", u"spam"):
            out.write(chunk)
        encoded = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(encoded in (self.spamle, self.spambe))
        # try to read it back
        source = codecs.getreader(self.encoding)(StringIO.StringIO(encoded))
        self.assertEqual(source.read(), u"spamspam")

    def test_badbom(self):
        """Input starting with two 0xff bytes is rejected with UnicodeError."""
        for raw in ("\xff\xff", "\xff\xff\xff\xff"):
            f = codecs.getreader(self.encoding)(StringIO.StringIO(raw))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: two BOM bytes, then two or four bytes per character."""
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            2 * [u""] +     # BOM bytes; byteorder known after the second
            1 * [u""] +
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            2 * [u"\x00\xff\u0100"] +
            4 * [u"\x00\xff\u0100\uffff"] +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        """A lone byte is replaced or dropped depending on the error handler."""
        for handler, decoded in (('replace', u'\ufffd'), ('ignore', u'')):
            self.assertEqual((decoded, 1),
                             codecs.utf_16_decode('\x01', handler, True))

    def test_errors(self):
        """A lone byte with final=True raises UnicodeDecodeError in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = u'Hello\r\nworld\r\n'
        payload = text.encode(self.encoding)

        path = test_support.TESTFN
        self.addCleanup(test_support.unlink, path)
        with open(path, 'wb') as fp:
            fp.write(payload)
        with codecs.open(path, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000499
class UTF16LETest(ReadTest):
    """ReadTest checks plus malformed-input cases for "utf-16-le"."""
    encoding = "utf-16-le"

    def test_partial(self):
        """Byte-by-byte decoding: two bytes per character, four for the last one."""
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            1 * [u""] +
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            2 * [u"\x00\xff\u0100"] +
            4 * [u"\x00\xff\u0100\uffff"] +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )

    def test_errors(self):
        """Malformed input raises in strict mode and yields U+FFFD with 'replace'."""
        # (malformed bytes, result of decoding them with errors='replace')
        malformed = (
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        )
        for data, replaced in malformed:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              data, 'strict', True)
            self.assertEqual(data.decode('utf-16le', 'replace'), replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000536
class UTF16BETest(ReadTest):
    """ReadTest checks plus malformed-input cases for "utf-16-be"."""
    encoding = "utf-16-be"

    def test_partial(self):
        """Byte-by-byte decoding: two bytes per character, four for the last one."""
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            1 * [u""] +
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            2 * [u"\x00\xff\u0100"] +
            4 * [u"\x00\xff\u0100\uffff"] +
            [u"\x00\xff\u0100\uffff\U00010000"]
        )

    def test_errors(self):
        """Malformed input raises in strict mode and yields U+FFFD with 'replace'."""
        # (malformed bytes, result of decoding them with errors='replace')
        malformed = (
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        )
        for data, replaced in malformed:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              data, 'strict', True)
            self.assertEqual(data.decode('utf-16be', 'replace'), replaced)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000573
class UTF8Test(ReadTest):
    """ReadTest checks for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        """Byte-by-byte decoding of characters that take 1, 2, 2, 3, 3 and 4 bytes."""
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            3 * [u"\x00\xff\u07ff"] +
            3 * [u"\x00\xff\u07ff\u0800"] +
            4 * [u"\x00\xff\u07ff\u0800\uffff"] +
            [u"\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
598
class UTF7Test(ReadTest):
    """ReadTest checks plus error and non-BMP cases for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        """Byte-by-byte decoding of the encoded form of u"a+-b"."""
        text = u"a+-b"
        steps = [
            u"a",
            u"a",
            u"a+",
            u"a+-",
            u"a+-b",
        ]
        self.check_partial(text, steps)

    def test_errors(self):
        """Malformed input raises in strict mode and yields U+FFFD with 'replace'."""
        # (malformed bytes, result of decoding them with errors='replace')
        malformed = (
            ('a\xffb', u'a\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
        )
        for data, replaced in malformed:
            self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                              data, 'strict', True)
            self.assertEqual(data.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        """U+104A0 and its surrogate-pair spelling both encode to '+2AHcoA-'."""
        encoded = '+2AHcoA-'
        for text in (u'\U000104A0', u'\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), u'\U000104A0')
642
Walter Dörwalde22d3392005-11-17 08:52:34 +0000643class UTF16ExTest(unittest.TestCase):
644
645 def test_errors(self):
646 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
647
648 def test_bad_args(self):
649 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
650
651class ReadBufferTest(unittest.TestCase):
652
653 def test_array(self):
654 import array
655 self.assertEqual(
656 codecs.readbuffer_encode(array.array("c", "spam")),
657 ("spam", 4)
658 )
659
660 def test_empty(self):
661 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
662
663 def test_bad_args(self):
664 self.assertRaises(TypeError, codecs.readbuffer_encode)
665 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
666
667class CharBufferTest(unittest.TestCase):
668
669 def test_string(self):
670 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
671
672 def test_empty(self):
673 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
674
675 def test_bad_args(self):
676 self.assertRaises(TypeError, codecs.charbuffer_encode)
677 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
678
class UTF8SigTest(ReadTest):
    """ReadTest checks plus BOM handling for the "utf-8-sig" codec."""
    encoding = "utf-8-sig"

    def test_partial(self):
        """Byte-by-byte decoding: the first BOM is skipped, the second emitted."""
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            3 * [u""] +             # First BOM has been read and skipped
            2 * [u""] +
            [u"\ufeff"] +           # Second BOM has been read and emitted
            2 * [u"\ufeff\x00"] +   # "\x00", then first byte of u"\xff"
            2 * [u"\ufeff\x00\xff"] +   # u"\xff", then first byte of u"\u07ff"
            3 * [u"\ufeff\x00\xff\u07ff"] +
            3 * [u"\ufeff\x00\xff\u07ff\u0800"] +
            4 * [u"\ufeff\x00\xff\u07ff\u0800\uffff"] +
            [u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        """The incremental decoder round-trips text encoded with the codec."""
        text = u"spam"
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream(self, bytestring, unistring):
        # Decode bytestring through a fresh StreamReader for each size
        # hint (None means read() without an argument) and compare the
        # concatenated chunks with unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            read_args = () if sizehint is None else (sizehint,)
            while True:
                data = istream.read(*read_args)
                if not data:
                    break
                ostream.write(data)

            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        """Chunked reads of input that starts with a BOM drop the BOM."""
        self._check_stream(codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ",
                           u"ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        """Chunked reads of input without a BOM decode it unchanged."""
        self._check_stream("ABC\xC2\xA1\xE2\x88\x80XYZ",
                           u"ABC\u00A1\u2200XYZ")
762
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        """The empty string decodes to an empty string of length 0."""
        result = codecs.escape_decode("")
        self.assertEqual(result, ("", 0))

    def test_raw(self):
        """Any character except a backslash, followed by '0', is left unchanged."""
        decode = codecs.escape_decode
        for code in range(256):
            ch = chr(code)
            if ch == '\\':
                continue
            self.assertEqual(decode(ch + '0'), (ch + '0', 2))

    def test_escape(self):
        """Known escapes are translated; other backslash pairs are kept as is."""
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, expected decoded output)
        cases = (
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        )
        for escaped, decoded in cases:
            check(escaped, decoded)
        # characters that start a recognized escape sequence
        recognized = '\n"\'\\abtnvfr01234567x'
        for code in range(256):
            ch = chr(code)
            if ch in recognized:
                continue
            check('\\' + ch, '\\' + ch)

    def test_errors(self):
        """Truncated \\x escapes raise ValueError unless an error handler is given."""
        decode = codecs.escape_decode
        for truncated in (br"\x", br"\x0"):
            bracketed = b"[" + truncated + b"]"
            self.assertRaises(ValueError, decode, truncated)
            self.assertRaises(ValueError, decode, bracketed)
            doubled = bracketed + truncated
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200814
class RecodingTest(unittest.TestCase):
    """Smoke test for codecs.EncodedFile()."""

    def test_recoding(self):
        """Write through an EncodedFile wrapper and close it; nothing is asserted."""
        backing = StringIO.StringIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000823
# From RFC 3492
# Each entry is a (unicode text, expected punycode) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #     T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #     <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--"),
]
927
# Import-time sanity check: every entry must be a (unicode, punycode) pair;
# print any entry that is not so a typo in the table is easy to spot.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
931
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for text, expected in punycode_testcases:
            # Compare case-insensitively.  Some of the reference encodings
            # contain upper case while our encoder emits only lower case, and
            # lowering just the reference is not enough because some input
            # characters (copied verbatim into the output) are upper case too.
            encoded = text.encode("punycode")
            self.assertEqual(encoded.lower(), expected.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000945
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000946class UnicodeInternalTest(unittest.TestCase):
947 def test_bug1251300(self):
948 # Decoding with unicode_internal used to not correctly handle "code
949 # points" above 0x10ffff on UCS-4 builds.
950 if sys.maxunicode > 0xffff:
951 ok = [
952 ("\x00\x10\xff\xff", u"\U0010ffff"),
953 ("\x00\x00\x01\x01", u"\U00000101"),
954 ("", u""),
955 ]
956 not_ok = [
957 "\x7f\xff\xff\xff",
958 "\x80\x00\x00\x00",
959 "\x81\x00\x00\x00",
960 "\x00",
961 "\x00\x00\x00\x00\x00",
962 ]
963 for internal, uni in ok:
964 if sys.byteorder == "little":
965 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +0000966 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000967 for internal in not_ok:
968 if sys.byteorder == "little":
969 internal = "".join(reversed(internal))
970 self.assertRaises(UnicodeDecodeError, internal.decode,
971 "unicode_internal")
972
973 def test_decode_error_attributes(self):
974 if sys.maxunicode > 0xffff:
975 try:
976 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
977 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +0000978 self.assertEqual("unicode_internal", ex.encoding)
979 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
980 self.assertEqual(4, ex.start)
981 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000982 else:
983 self.fail()
984
985 def test_decode_callback(self):
986 if sys.maxunicode > 0xffff:
987 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
988 decoder = codecs.getdecoder("unicode_internal")
989 ab = u"ab".encode("unicode_internal")
990 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
991 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +0000992 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000993
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000994 def test_encode_length(self):
995 # Issue 3739
996 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +0000997 self.assertEqual(encoder(u"a")[1], 1)
998 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +0000999
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001000 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +00001001 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001002
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected output) pair of UTF-8 byte strings.  An
# expected output of None means nameprep must reject the input; an input of
# None marks a vector that is skipped (the slot is kept so that list position
# N-1 still corresponds to test vector 3.N).
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1155
1156
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is not None:
                prepped = unicode(prepped, "utf-8")
            # Both kinds of check run inside the same try block so that any
            # failure is reported with the number of the offending vector.
            try:
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    self.assertEqual(nameprep(orig), prepped)
            except Exception as e:
                raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1175
Walter Dörwald78a0be62006-04-14 18:25:39 +00001176class IDNACodecTest(unittest.TestCase):
1177 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001178 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1179 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1180 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1181 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001182
1183 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001184 self.assertEqual(u"python.org".encode("idna"), "python.org")
1185 self.assertEqual("python.org.".encode("idna"), "python.org.")
1186 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1187 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001188
Martin v. Löwis8b595142005-08-25 11:03:38 +00001189 def test_stream(self):
1190 import StringIO
1191 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1192 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001193 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001194
Walter Dörwald78a0be62006-04-14 18:25:39 +00001195 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001196 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001197 "".join(codecs.iterdecode("python.org", "idna")),
1198 u"python.org"
1199 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001200 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001201 "".join(codecs.iterdecode("python.org.", "idna")),
1202 u"python.org."
1203 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001204 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001205 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1206 u"pyth\xf6n.org."
1207 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001208 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001209 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1210 u"pyth\xf6n.org."
1211 )
1212
1213 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001214 self.assertEqual(decoder.decode("xn--xam", ), u"")
1215 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1216 self.assertEqual(decoder.decode(u"rg"), u"")
1217 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001218
1219 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001220 self.assertEqual(decoder.decode("xn--xam", ), u"")
1221 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1222 self.assertEqual(decoder.decode("rg."), u"org.")
1223 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001224
1225 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001226 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001227 "".join(codecs.iterencode(u"python.org", "idna")),
1228 "python.org"
1229 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001230 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001231 "".join(codecs.iterencode(u"python.org.", "idna")),
1232 "python.org."
1233 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001234 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001235 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1236 "xn--pythn-mua.org."
1237 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001238 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001239 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1240 "xn--pythn-mua.org."
1241 )
1242
1243 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001244 self.assertEqual(encoder.encode(u"\xe4x"), "")
1245 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1246 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001247
1248 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001249 self.assertEqual(encoder.encode(u"\xe4x"), "")
1250 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1251 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001252
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001253class CodecsModuleTest(unittest.TestCase):
1254
1255 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001256 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001257 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001258 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001259 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001260 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1261
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001262 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001263 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001264 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001265 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001266 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001267 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001268 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1269
1270 def test_register(self):
1271 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001272 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001273
1274 def test_lookup(self):
1275 self.assertRaises(TypeError, codecs.lookup)
1276 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001277 self.assertRaises(LookupError, codecs.lookup, " ")
1278
1279 def test_getencoder(self):
1280 self.assertRaises(TypeError, codecs.getencoder)
1281 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1282
1283 def test_getdecoder(self):
1284 self.assertRaises(TypeError, codecs.getdecoder)
1285 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1286
1287 def test_getreader(self):
1288 self.assertRaises(TypeError, codecs.getreader)
1289 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1290
1291 def test_getwriter(self):
1292 self.assertRaises(TypeError, codecs.getwriter)
1293 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001294
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001295 def test_lookup_issue1813(self):
1296 # Issue #1813: under Turkish locales, lookup of some codecs failed
1297 # because 'I' is lowercased as a dotless "i"
1298 oldlocale = locale.getlocale(locale.LC_CTYPE)
1299 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1300 try:
1301 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1302 except locale.Error:
1303 # Unsupported locale on this system
1304 self.skipTest('test needs Turkish locale')
1305 c = codecs.lookup('ASCII')
1306 self.assertEqual(c.name, 'ascii')
1307
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001308class StreamReaderTest(unittest.TestCase):
1309
1310 def setUp(self):
1311 self.reader = codecs.getreader('utf-8')
1312 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1313
1314 def test_readlines(self):
1315 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001316 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001317
Georg Brandl8f99f812006-10-29 08:39:22 +00001318class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001319
Georg Brandl8f99f812006-10-29 08:39:22 +00001320 def test_basic(self):
1321 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001322 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001323 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001324
1325 f = StringIO.StringIO()
1326 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1327 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001328 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001329
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001330class Str2StrTest(unittest.TestCase):
1331
1332 def test_read(self):
1333 sin = "\x80".encode("base64_codec")
1334 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1335 sout = reader.read()
1336 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001337 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001338
1339 def test_readline(self):
1340 sin = "\x80".encode("base64_codec")
1341 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1342 sout = reader.readline()
1343 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001344 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001345
# Names of the codecs exercised by the generic unicode tests, in the order
# in which they are tested.
all_unicode_encodings = """
    ascii base64_codec big5 big5hkscs charmap
    cp037 cp1006 cp1026 cp1140
    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258
    cp424 cp437 cp500 cp720 cp737 cp775
    cp850 cp852 cp855 cp856 cp857 cp858
    cp860 cp861 cp862 cp863 cp864 cp865 cp866 cp869
    cp874 cp875 cp932 cp949 cp950
    euc_jis_2004 euc_jisx0213 euc_jp euc_kr
    gb18030 gb2312 gbk
    hex_codec hp_roman8 hz idna
    iso2022_jp iso2022_jp_1 iso2022_jp_2 iso2022_jp_2004 iso2022_jp_3
    iso2022_jp_ext iso2022_kr
    iso8859_1 iso8859_10 iso8859_11 iso8859_13 iso8859_14 iso8859_15
    iso8859_16 iso8859_2 iso8859_3 iso8859_4 iso8859_5 iso8859_6
    iso8859_7 iso8859_8 iso8859_9
    johab koi8_r koi8_u latin_1
    mac_cyrillic mac_greek mac_iceland mac_latin2 mac_roman mac_turkish
    palmos ptcp154 punycode raw_unicode_escape rot_13
    shift_jis shift_jis_2004 shift_jisx0213
    tis_620 unicode_escape unicode_internal
    utf_16 utf_16_be utf_16_le utf_7 utf_8
""".split()

# "mbcs" is only added when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1453
# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]
1460
1461# The following encoding is not tested, because it's not supposed
1462# to work:
1463# "undefined"
1464
# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Independent copy: later additions to the stream list must not affect the
# list of codecs excluded from the incremental coder tests.
broken_incremental_coders = list(broken_unicode_with_streams)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001473
# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]
1480
def _add_optional_codec(codec_name):
    # Test the codec like any other one, but leave it out of the stream
    # reader/writer checks.
    all_unicode_encodings.append(codec_name)
    broken_unicode_with_streams.append(codec_name)

# The bz2 and zlib codecs are only tested when the module they rely on can
# be imported.
try:
    import bz2
except ImportError:
    pass
else:
    _add_optional_codec("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    _add_optional_codec("zlib_codec")
1496
class BasicUnicodeTest(unittest.TestCase):
    def _mismatch(self, got, expected, encoding):
        # Failure message shared by the round-trip assertions.
        return "%r != %r (encoding=%r)" % (got, expected, encoding)

    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by the codec registry must agree with the
            # table entry, ignoring "_" versus "-".  The "_codec" suffix is
            # added back and latin_1 is special-cased before comparing.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless round trip.
            encoded, consumed = codecs.getencoder(encoding)(s)
            self.assertEqual(consumed, len(s),
                             self._mismatch(consumed, len(s), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, s, self._mismatch(decoded, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, one item at a time
                pipe = Queue()
                writer = codecs.getwriter(encoding)(pipe)
                pieces = []
                for char in s:
                    writer.write(char)
                    pieces.append(pipe.read())
                encodedresult = "".join(pieces)

                pipe = Queue()
                reader = codecs.getreader(encoding)(pipe)
                pieces = []
                for byte in encodedresult:
                    pipe.write(byte)
                    pieces.append(reader.read())
                decodedresult = u"".join(pieces)
                self.assertEqual(decodedresult, s,
                                 self._mismatch(decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # Same round trip twice: first with the coders obtained
                    # through the Python API, then through the C API.
                    coders = (
                        (encoder,
                         lambda: codecs.getincrementaldecoder(encoding)()),
                        (cencoder,
                         lambda: _testcapi.codec_incrementaldecoder(encoding)),
                    )
                    for enc, new_decoder in coders:
                        encodedresult = "".join([enc.encode(c) for c in s])
                        encodedresult += enc.encode(u"", True)
                        dec = new_decoder()
                        decodedresult = u"".join(
                            [dec.decode(c) for c in encodedresult])
                        decodedresult += dec.decode("", True)
                        self.assertEqual(
                            decodedresult, s,
                            self._mismatch(decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s,
                                     self._mismatch(result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(
                        codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        coders = (
                            (encoder,
                             lambda: codecs.getincrementaldecoder(encoding)("ignore")),
                            (cencoder,
                             lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore")),
                        )
                        for enc, new_decoder in coders:
                            encodedresult = "".join(enc.encode(c) for c in s)
                            dec = new_decoder()
                            decodedresult = u"".join(
                                dec.decode(c) for c in encodedresult)
                            self.assertEqual(
                                decodedresult, s,
                                self._mismatch(decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # FIXME: idna is excluded, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            raw = StringIO.StringIO(s.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for attempt in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempt from the non-string argument check.
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1619
class BasicStrTest(unittest.TestCase):
    """Round-trip check for the codecs listed in all_string_encodings."""

    def test_basics(self):
        """Encode a byte string, decode the result, and compare with the input."""
        text = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text))
            # Only the decoded value is checked; the length reported by
            # the decoder is not.
            decoded, _ = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(
                decoded, text,
                "%r != %r (encoding=%r)" % (decoded, text, encoding))
1628
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001629class CharmapTest(unittest.TestCase):
1630 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001631 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001632 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1633 (u"abc", 3)
1634 )
1635
Serhiy Storchaka95997452013-01-15 14:42:59 +02001636 self.assertRaises(UnicodeDecodeError,
1637 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1638 )
1639
1640 self.assertRaises(UnicodeDecodeError,
1641 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1642 )
1643
Ezio Melotti2623a372010-11-21 13:34:58 +00001644 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001645 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1646 (u"ab\ufffd", 3)
1647 )
1648
Ezio Melotti2623a372010-11-21 13:34:58 +00001649 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001650 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1651 (u"ab\ufffd", 3)
1652 )
1653
Ezio Melotti2623a372010-11-21 13:34:58 +00001654 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001655 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1656 (u"ab", 3)
1657 )
1658
Ezio Melotti2623a372010-11-21 13:34:58 +00001659 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001660 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1661 (u"ab", 3)
1662 )
1663
1664 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001665 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001666 codecs.charmap_decode(allbytes, "ignore", u""),
1667 (u"", len(allbytes))
1668 )
1669
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001670 def test_decode_with_int2str_map(self):
1671 self.assertEqual(
1672 codecs.charmap_decode("\x00\x01\x02", "strict",
1673 {0: u'a', 1: u'b', 2: u'c'}),
1674 (u"abc", 3)
1675 )
1676
1677 self.assertEqual(
1678 codecs.charmap_decode("\x00\x01\x02", "strict",
1679 {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
1680 (u"AaBbCc", 3)
1681 )
1682
1683 self.assertEqual(
1684 codecs.charmap_decode("\x00\x01\x02", "strict",
1685 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
1686 (u"\U0010FFFFbc", 3)
1687 )
1688
1689 self.assertEqual(
1690 codecs.charmap_decode("\x00\x01\x02", "strict",
1691 {0: u'a', 1: u'b', 2: u''}),
1692 (u"ab", 3)
1693 )
1694
1695 self.assertRaises(UnicodeDecodeError,
1696 codecs.charmap_decode, "\x00\x01\x02", "strict",
1697 {0: u'a', 1: u'b'}
1698 )
1699
Serhiy Storchaka95997452013-01-15 14:42:59 +02001700 self.assertRaises(UnicodeDecodeError,
1701 codecs.charmap_decode, "\x00\x01\x02", "strict",
1702 {0: u'a', 1: u'b', 2: None}
1703 )
1704
1705 # Issue #14850
1706 self.assertRaises(UnicodeDecodeError,
1707 codecs.charmap_decode, "\x00\x01\x02", "strict",
1708 {0: u'a', 1: u'b', 2: u'\ufffe'}
1709 )
1710
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001711 self.assertEqual(
1712 codecs.charmap_decode("\x00\x01\x02", "replace",
1713 {0: u'a', 1: u'b'}),
1714 (u"ab\ufffd", 3)
1715 )
1716
1717 self.assertEqual(
1718 codecs.charmap_decode("\x00\x01\x02", "replace",
1719 {0: u'a', 1: u'b', 2: None}),
1720 (u"ab\ufffd", 3)
1721 )
1722
Serhiy Storchaka95997452013-01-15 14:42:59 +02001723 # Issue #14850
1724 self.assertEqual(
1725 codecs.charmap_decode("\x00\x01\x02", "replace",
1726 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1727 (u"ab\ufffd", 3)
1728 )
1729
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001730 self.assertEqual(
1731 codecs.charmap_decode("\x00\x01\x02", "ignore",
1732 {0: u'a', 1: u'b'}),
1733 (u"ab", 3)
1734 )
1735
1736 self.assertEqual(
1737 codecs.charmap_decode("\x00\x01\x02", "ignore",
1738 {0: u'a', 1: u'b', 2: None}),
1739 (u"ab", 3)
1740 )
1741
Serhiy Storchaka95997452013-01-15 14:42:59 +02001742 # Issue #14850
1743 self.assertEqual(
1744 codecs.charmap_decode("\x00\x01\x02", "ignore",
1745 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1746 (u"ab", 3)
1747 )
1748
1749 allbytes = "".join(chr(i) for i in xrange(256))
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001750 self.assertEqual(
1751 codecs.charmap_decode(allbytes, "ignore", {}),
1752 (u"", len(allbytes))
1753 )
1754
1755 def test_decode_with_int2int_map(self):
1756 a = ord(u'a')
1757 b = ord(u'b')
1758 c = ord(u'c')
1759
1760 self.assertEqual(
1761 codecs.charmap_decode("\x00\x01\x02", "strict",
1762 {0: a, 1: b, 2: c}),
1763 (u"abc", 3)
1764 )
1765
1766 # Issue #15379
1767 self.assertEqual(
1768 codecs.charmap_decode("\x00\x01\x02", "strict",
1769 {0: 0x10FFFF, 1: b, 2: c}),
1770 (u"\U0010FFFFbc", 3)
1771 )
1772
1773 self.assertRaises(TypeError,
1774 codecs.charmap_decode, "\x00\x01\x02", "strict",
1775 {0: 0x110000, 1: b, 2: c}
1776 )
1777
1778 self.assertRaises(UnicodeDecodeError,
1779 codecs.charmap_decode, "\x00\x01\x02", "strict",
1780 {0: a, 1: b},
1781 )
1782
Serhiy Storchaka95997452013-01-15 14:42:59 +02001783 self.assertRaises(UnicodeDecodeError,
1784 codecs.charmap_decode, "\x00\x01\x02", "strict",
1785 {0: a, 1: b, 2: 0xFFFE},
1786 )
1787
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001788 self.assertEqual(
1789 codecs.charmap_decode("\x00\x01\x02", "replace",
1790 {0: a, 1: b}),
1791 (u"ab\ufffd", 3)
1792 )
1793
1794 self.assertEqual(
Serhiy Storchaka95997452013-01-15 14:42:59 +02001795 codecs.charmap_decode("\x00\x01\x02", "replace",
1796 {0: a, 1: b, 2: 0xFFFE}),
1797 (u"ab\ufffd", 3)
1798 )
1799
1800 self.assertEqual(
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001801 codecs.charmap_decode("\x00\x01\x02", "ignore",
1802 {0: a, 1: b}),
1803 (u"ab", 3)
1804 )
1805
Serhiy Storchaka95997452013-01-15 14:42:59 +02001806 self.assertEqual(
1807 codecs.charmap_decode("\x00\x01\x02", "ignore",
1808 {0: a, 1: b, 2: 0xFFFE}),
1809 (u"ab", 3)
1810 )
1811
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001812
Georg Brandl8f99f812006-10-29 08:39:22 +00001813class WithStmtTest(unittest.TestCase):
1814 def test_encodedfile(self):
1815 f = StringIO.StringIO("\xc3\xbc")
1816 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001817 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001818
1819 def test_streamreaderwriter(self):
1820 f = StringIO.StringIO("\xc3\xbc")
1821 info = codecs.lookup("utf-8")
1822 with codecs.StreamReaderWriter(f, info.streamreader,
1823 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001824 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001825
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001826
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() and unicode_escape_decode()."""

    def test_empty(self):
        """Empty input gives empty output with a length of zero."""
        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        """Characters 32..126 other than the backslash encode as themselves."""
        encode = codecs.unicode_escape_encode
        backslash = ord('\\')
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(unichr(code)), (chr(code), 1))

    def test_raw_decode(self):
        """Any byte other than the backslash decodes to the same code point."""
        decode = codecs.unicode_escape_decode
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(chr(code) + '0'),
                             (unichr(code) + u'0', 2))

    def test_escape_encode(self):
        """Check the escape sequences produced by the encoder."""
        check = coding_checker(self, codecs.unicode_escape_encode)

        # Characters that have a short symbolic escape.
        symbolic = (
            (u'\t', r'\t'),
            (u'\n', r'\n'),
            (u'\r', r'\r'),
            (u'\\', r'\\'),
        )
        for char, escaped in symbolic:
            check(char, escaped)

        # The remaining control characters and everything from 127 to 255
        # are written as two-digit hex escapes.
        for code in list(range(32)) + list(range(127, 256)):
            if chr(code) in '\t\n\r':
                continue
            check(unichr(code), '\\x%02x' % code)

        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        """Check how the decoder interprets each kind of escape sequence."""
        check = coding_checker(self, codecs.unicode_escape_decode)

        cases = (
            ("[\\\n]", u"[]"),          # backslash-newline is dropped
            (r'[\"]', u'["]'),
            (r"[\']", u"[']"),
            (r"[\\]", u"[\\]"),
            (r"[\a]", u"[\x07]"),
            (r"[\b]", u"[\x08]"),
            (r"[\t]", u"[\x09]"),
            (r"[\n]", u"[\x0a]"),
            (r"[\v]", u"[\x0b]"),
            (r"[\f]", u"[\x0c]"),
            (r"[\r]", u"[\x0d]"),
            (r"[\7]", u"[\x07]"),
            (r"[\8]", u"[\\8]"),        # 8 is not an octal digit
            (r"[\78]", u"[\x078]"),
            (r"[\41]", u"[!]"),
            (r"[\418]", u"[!8]"),
            (r"[\101]", u"[A]"),
            (r"[\1010]", u"[A0]"),      # at most three octal digits
            (r"[\x41]", u"[A]"),
            (r"[\x410]", u"[A0]"),      # exactly two hex digits
            (r"\u20ac", u"\u20ac"),
            (r"\U0001d120", u"\U0001d120"),
        )
        for escaped, expected in cases:
            check(escaped, expected)

        # A backslash followed by any other byte is passed through as is.
        escape_starters = '\n"\'\\abtnvfr01234567xuUN'
        for code in range(256):
            if chr(code) in escape_starters:
                continue
            check('\\' + chr(code), u'\\' + unichr(code))

    def test_decode_errors(self):
        """Truncated or out-of-range escapes must honour the error handler."""
        decode = codecs.unicode_escape_decode
        for letter, limit in (('x', 2), ('u', 4), ('U', 4)):
            for ndigits in range(limit):
                truncated = "\\" + letter + "0"*ndigits
                bracketed = "[" + truncated + "]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"),
                                 (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))

        # A complete escape naming a value above 0x10FFFF.
        out_of_range = r"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, out_of_range)
        self.assertEqual(decode(out_of_range, "ignore"), (u"", 10))
        self.assertEqual(decode(out_of_range, "replace"), (u"\ufffd", 10))
1903
1904
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() and its decoder."""

    def test_empty(self):
        """Empty input gives empty output with a length of zero."""
        self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        """Every code point below 256 encodes to the byte of the same value."""
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(unichr(code)), (chr(code), 1))

    def test_raw_decode(self):
        """Every byte decodes to the code point of the same value."""
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(chr(code) + '0'),
                             (unichr(code) + u'0', 2))

    def test_escape_encode(self):
        """A backslash pair is copied as is unless the second char is u or U."""
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check(u'\\' + unichr(code), '\\' + chr(code))
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        """Only the u and U escapes are interpreted by the decoder."""
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check('\\' + chr(code), u'\\' + unichr(code))
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")

    def test_decode_errors(self):
        """Truncated or out-of-range escapes must honour the error handler."""
        decode = codecs.raw_unicode_escape_decode
        for letter, limit in (('u', 4), ('U', 4)):
            for ndigits in range(limit):
                truncated = "\\" + letter + "0"*ndigits
                bracketed = "[" + truncated + "]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"),
                                 (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))

        # A complete escape naming a value above 0x10FFFF.
        out_of_range = r"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, out_of_range)
        self.assertEqual(decode(out_of_range, "ignore"), (u"", 10))
        self.assertEqual(decode(out_of_range, "replace"), (u"\ufffd", 10))
1953
1954
class BomTest(unittest.TestCase):
    """Tests for BOM handling when seeking in files opened with codecs.open()."""

    def test_seek0(self):
        """The BOM must be written once, and again only after a seek to 0.

        Each scenario reopens the scratch file in 'w+' mode, writes some
        text around a seek, and reads the file back to compare it with the
        text that was written.
        """
        text = u"1234567890"
        doubled = text * 2
        first_char = text[0]
        path = test_support.TESTFN
        self.addCleanup(test_support.unlink, path)

        def reopen(encoding):
            # Open the scratch file anew for reading and writing.
            return codecs.open(path, 'w+', encoding=encoding)

        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        for encoding in encodings:
            # Two consecutive writes must produce only one BOM, and the
            # content must read back the same after each rewind.
            with reopen(encoding) as f:
                f.write(text)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # After seek(0) the next write must emit the BOM again.
            with reopen(encoding) as f:
                f.write(first_char)
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # Same scenario, driving the underlying StreamWriter directly.
            with reopen(encoding) as f:
                f.writer.write(first_char)
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # A seek to a position other than the start must not cause
            # another BOM to be written.
            with reopen(encoding) as f:
                f.write(text)
                f.seek(f.tell())
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # Same scenario, driving the underlying StreamWriter directly.
            with reopen(encoding) as f:
                f.writer.write(text)
                f.writer.seek(f.writer.tell())
                f.writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinner7df55da2010-05-22 13:37:56 +00002010
Victor Stinner262be5e2010-05-22 02:11:07 +00002011
def test_main():
    """Run the test case classes listed below via test_support.run_unittest()."""
    test_cases = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
    )
    test_support.run_unittest(*test_cases)


# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()