# Exported from a git blame view of blob 5189e80b7398183f2d69df30f5d1a8b5573e396f ([file] [log] [blame]).
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
4import StringIO
5
class Queue(object):
    """
    In-memory FIFO of bytes: write() appends at the tail, read() consumes
    from the head.
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # Append to whatever has not been read yet.
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        # A negative size drains the whole buffer; otherwise at most
        # ``size`` items are removed from the front and returned.
        pending = self._buffer
        if size < 0:
            cut = len(pending)
        else:
            cut = size
        self._buffer = pending[cut:]
        return pending[:cut]
25
class ReadTest(unittest.TestCase):
    # Shared StreamReader/StreamWriter checks.  The codec under test is
    # read from ``self.encoding``; this base class does not define that
    # attribute itself, the subclasses in this file do.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True):
            # Collect lines until readline() returns an empty result.
            # NOTE: with keepends=False an empty line also ends the loop,
            # so inputs used here must not contain empty lines.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return u"".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        self.assertEqual(readalllines(s, True), s)
        self.assertEqual(readalllines(s, False), u"foobarbazspameggs")

        # The line endings must be listed explicitly: every one of them is
        # whitespace, so u"\n \r\n \r \u2028".split() returns [] and loops
        # driven by it silently never run.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty (see readalllines above).
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines(u"".join(vw), True), u"".join(vw))
        self.assertEqual(readalllines(u"".join(vw), False), u"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                # Each repetition is two lines: the "a" run and "xxx".
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # Iterating the reader must hand back every "\r\n"-terminated line
        # of this sample unchanged.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data arrives in pieces and
        # a trailing "\r" may or may not be followed by "\n" later on.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        # End of stream: nothing left to read.
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        # End of stream: nothing left to read.
        self.assertEqual(reader.readline(), u"")
215
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # u"spamspam" encoded with a single leading BOM, in either byte order.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        entry = codecs.lookup(self.encoding)
        reader, writer = entry[2], entry[3]
        # encode some stream, in two separate writes
        sink = StringIO.StringIO()
        out = writer(sink)
        for piece in (u"spam", u"spam"):
            out.write(piece)
        d = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        back = reader(StringIO.StringIO(d))
        self.assertEquals(back.read(), u"spamspam")

    def test_badbom(self):
        # Neither input starts with a valid BOM, so reading must fail.
        for raw in ("\xff\xff", "\xff\xff\xff\xff"):
            f = codecs.getreader(self.encoding)(StringIO.StringIO(raw))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # One expected prefix per encoded byte fed to the reader.
        expected = [
            u"", # first byte of BOM read
            u"", # second byte of BOM read => byteorder known
            u"",
            u"\x00",
            u"\x00",
            u"\x00\xff",
            u"\x00\xff",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)
262
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # One expected prefix per encoded byte fed to the reader.
        expected = [
            u"",
            u"\x00",
            u"\x00",
            u"\x00\xff",
            u"\x00\xff",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)
280
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # One expected prefix per encoded byte fed to the reader.
        expected = [
            u"",
            u"\x00",
            u"\x00",
            u"\x00\xff",
            u"\x00\xff",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100",
            u"\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)
298
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        text = u"\x00\xff\u07ff\u0800\uffff"
        # One expected prefix per encoded byte fed to the reader.
        expected = [
            u"\x00",
            u"\x00",
            u"\x00\xff",
            u"\x00\xff",
            u"\x00\xff\u07ff",
            u"\x00\xff\u07ff",
            u"\x00\xff\u07ff",
            u"\x00\xff\u07ff\u0800",
            u"\x00\xff\u07ff\u0800",
            u"\x00\xff\u07ff\u0800",
            u"\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(text, expected)
319
Walter Dörwald8709a422002-09-03 13:53:40 +0000320class EscapeDecodeTest(unittest.TestCase):
321 def test_empty_escape_decode(self):
322 self.assertEquals(codecs.escape_decode(""), ("", 0))
323
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000324class RecodingTest(unittest.TestCase):
325 def test_recoding(self):
326 f = StringIO.StringIO()
327 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
328 f2.write(u"a")
329 f2.close()
330 # Python used to crash on this at exit because of a refcount
331 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000332
# From RFC 3492
# Each entry is a (unicode text, punycode form) pair; PunycodeTest encodes
# the first element and decodes the second.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),


    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple (for
# example after a missing comma merged or split adjacent items).  It only
# reports; it does not stop the module from loading.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
441
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode")
            self.assertEquals(produced.lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            decoded = puny.decode("punycode")
            self.assertEquals(uni, decoded)
455
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected), both given as UTF-8 byte strings.
# As consumed by NameprepTest: an expected value of None means nameprep
# must reject the input, and (None, None) marks a skipped vector that only
# keeps the list position aligned with the draft's numbering.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
608
609
610class NameprepTest(unittest.TestCase):
611 def test_nameprep(self):
612 from encodings.idna import nameprep
613 for pos, (orig, prepped) in enumerate(nameprep_tests):
614 if orig is None:
615 # Skipped
616 continue
617 # The Unicode strings are given in UTF-8
618 orig = unicode(orig, "utf-8")
619 if prepped is None:
620 # Input contains prohibited characters
621 self.assertRaises(UnicodeError, nameprep, orig)
622 else:
623 prepped = unicode(prepped, "utf-8")
624 try:
625 self.assertEquals(nameprep(orig), prepped)
626 except Exception,e:
627 raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
628
Martin v. Löwisa1dde132004-03-24 16:48:24 +0000629class CodecTest(unittest.TestCase):
630 def test_builtin(self):
631 self.assertEquals(unicode("python.org", "idna"), u"python.org")
632
Martin v. Löwis8b595142005-08-25 11:03:38 +0000633 def test_stream(self):
634 import StringIO
635 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
636 r.read(3)
637 self.assertEquals(r.read(), u"")
638
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000639class CodecsModuleTest(unittest.TestCase):
640
641 def test_decode(self):
642 self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
643 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +0000644 self.assertRaises(TypeError, codecs.decode)
645 self.assertEquals(codecs.decode('abc'), u'abc')
646 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
647
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000648 def test_encode(self):
649 self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
650 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +0000651 self.assertRaises(TypeError, codecs.encode)
652 self.assertEquals(codecs.encode(u'abc'), 'abc')
653 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
654
655 def test_register(self):
656 self.assertRaises(TypeError, codecs.register)
657
658 def test_lookup(self):
659 self.assertRaises(TypeError, codecs.lookup)
660 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +0000661
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000662class StreamReaderTest(unittest.TestCase):
663
664 def setUp(self):
665 self.reader = codecs.getreader('utf-8')
666 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
667
668 def test_readlines(self):
669 f = self.reader(self.stream)
670 self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
671
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000672class Str2StrTest(unittest.TestCase):
673
674 def test_read(self):
675 sin = "\x80".encode("base64_codec")
676 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
677 sout = reader.read()
678 self.assertEqual(sout, "\x80")
679 self.assert_(isinstance(sout, str))
680
681 def test_readline(self):
682 sin = "\x80".encode("base64_codec")
683 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
684 sout = reader.readline()
685 self.assertEqual(sout, "\x80")
686 self.assert_(isinstance(sout, str))
687
# Codec names that BasicUnicodeTest runs its checks against.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only tested when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
# (BasicUnicodeTest skips its stream reader/writer checks for them)
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]

# The compression codecs are only added when their backing module can be
# imported; both are also listed as unusable with streams.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
828
class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # Stateless round trip through the encoder and decoder functions.
            encode = codecs.getencoder(encoding)
            decode = codecs.getdecoder(encoding)
            (encoded, size) = encode(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = decode(encoded)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding in broken_unicode_with_streams:
                continue

            # check stream reader/writer: write one character at a time,
            # then feed the encoded result back one item at a time
            q = Queue()
            writer = codecs.getwriter(encoding)(q)
            pieces = []
            for c in s:
                writer.write(c)
                pieces.append(q.read())
            encodedresult = "".join(pieces)

            q = Queue()
            reader = codecs.getreader(encoding)(q)
            decodedresult = u""
            for c in encodedresult:
                q.write(c)
                decodedresult = decodedresult + reader.read()
            self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # FIXME: "idna" is skipped, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            source = StringIO.StringIO(s.encode(encoding))
            reader = codecs.getreader(encoding)(source)
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)
869
class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        # Stateless round trip for the codecs that work on str only.
        s = "abc123"
        for encoding in all_string_encodings:
            encode = codecs.getencoder(encoding)
            decode = codecs.getdecoder(encoding)
            (encoded, size) = encode(s)
            self.assertEqual(size, len(s))
            (chars, size) = decode(encoded)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
878
def test_main():
    # Run the concrete test classes, in this order.  ReadTest is left
    # out: it is only the shared base of the UTF-* classes.
    cases = (
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        NameprepTest,
        CodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
    )
    test_support.run_unittest(*cases)


if __name__ == "__main__":
    test_main()