from test import test_support
import unittest
import codecs
import StringIO

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size < 0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

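# Base class for the streaming decoder tests below; concrete subclasses
# (UTF16Test, UTF8Test, ...) only have to set self.encoding.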
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

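    # readline() has to recognize all line endings the test data uses (\n,
    # \r\n, \r and the U+2028 line separator), with and without keepends.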
    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        self.assertEqual(readalllines(s, True), s)
        self.assertEqual(readalllines(s, False), u"foobarbazspameggs")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

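    # Regression test for SF bug #1175396: iterating a StreamReader over a
    # multi-line document must yield every input line unchanged.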
    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

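    # Interleave a StreamWriter and a StreamReader on the same Queue and make
    # sure readline() copes with line endings that arrive in separate writes.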
    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

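    # Regression tests for SF bug #1098990: readline() must reproduce every
    # input line exactly, with no characters lost or carried over into the
    # next line.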
    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

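# Concrete ReadTest subclasses: each one pins down self.encoding and adds
# encoding-specific checks (BOM handling, partial-input decoding, ...).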
class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

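    # "\xff\xff" is not a valid byte order mark, so the generic UTF-16 reader
    # cannot determine the byte order and must raise UnicodeError.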
    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class EscapeDecodeTest(unittest.TestCase):
    def test_empty_escape_decode(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),


    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

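# Sanity check: every entry above must be a (unicode, punycode) pair.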
for i in punycode_testcases:
    if len(i) != 2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception, e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class CodecTest(unittest.TestCase):
    def test_builtin(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

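# base64_codec is a str-to-str codec: its StreamReader must return str
# objects, not unicode.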
class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

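# Unicode codecs exercised by BasicUnicodeTest below via encode/decode
# round trips.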
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#   "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]

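# The bz2 and zlib codecs are optional; test them only when the underlying
# modules are importable.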
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

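# Round-trip u"abc123" through every Unicode codec, first with the stateless
# encoder/decoder functions and then (where stream support isn't broken)
# character by character through the codec's StreamWriter/StreamReader.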
class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        NameprepTest,
        CodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest
    )


if __name__ == "__main__":
    test_main()