from test import test_support
import unittest
import codecs
import sys, StringIO

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size < 0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

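# A Queue plus a codecs.StreamReader gives incremental decoding: bytes go in
# at one end and whatever is decodable so far comes out at the other.
# Minimal sketch (not executed by the tests), assuming a UTF-8 reader:
#
#     q = Queue()
#     r = codecs.getreader("utf-8")(q)
#     q.write("\xe2\x82")          # first two bytes of U+20AC (euro sign)
#     assert r.read() == u""       # incomplete sequence, nothing decoded yet
#     q.write("\xac")              # final byte arrives
#     assert r.read() == u"\u20ac"
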
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        self.assertEqual(readalllines(s, True), s)
        self.assertEqual(readalllines(s, False), u"foobarbazspameggs")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
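        # SF bug #1175396: iterating a StreamReader over a document made up
        # of many "\r\n"-terminated lines must yield exactly the original
        # lines, one by one.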
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '    \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
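        # readline() has to behave sensibly when the data arrives piecemeal
        # through a Queue, in particular when a "\r" has been written but a
        # possible following "\n" has not arrived yet.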
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
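        # Bug #1098990: one line is much longer than the others, so readline()
        # needs more than one internal read() to assemble it; the following
        # lines must still come back intact.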
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

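    # UTF-16-LE and UTF-16-BE encodings of u"spamspam", each starting with the
    # corresponding byte order mark.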
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
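        # "\xff\xff" is not a valid byte order mark, so decoding the stream
        # must fail.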
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
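        # Each list entry is the expected output after one more input byte has
        # been fed in; a multi-byte UTF-8 sequence only shows up once its last
        # byte has arrived.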
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class EscapeDecodeTest(unittest.TestCase):
    def test_empty_escape_decode(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),


    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

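# Sanity check: every test case must be a (unicode, punycode) pair.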
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                    "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
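            # Splice four bytes that do not form a valid code point into the
            # middle of the encoding of u"ab" and check that the registered
            # "ignore" handler skips them.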
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class CodecTest(unittest.TestCase):
    def test_builtin(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
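        # UTF-8 encoding of u"\ud55c\n\uae00" (two Hangul syllables separated
        # by a newline)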
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

class Str2StrTest(unittest.TestCase):
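    # base64_codec transforms str to str, so reading through its StreamReader
    # must also yield str objects, not unicode.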

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#     "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
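        # Round-trip the sample string through every codec, first with the
        # stateless encoder/decoder functions and then, where supported,
        # character by character through a StreamWriter/StreamReader pair.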
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
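        # When the mapping is a unicode string, byte value i decodes to map[i];
        # u"\ufffe" in the map (or a byte beyond the end of the map) marks the
        # mapping as undefined, and the error handler decides what happens.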
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )


def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        CodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest
    )


if __name__ == "__main__":
    test_main()