from test import test_support
import unittest
import codecs
import StringIO

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
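        # (A multibyte codec may have to hold back bytes that do not yet form
        # a complete character; the expected intermediate strings are listed
        # in each codec's test_partial below.)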
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        self.assertEqual(readalllines(s, True), s)
        self.assertEqual(readalllines(s, False), u"foobarbazspameggs")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead to see whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
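        # Regression test for bug #1175396: iterating over a StreamReader
        # must yield exactly the lines that were encoded, even for text that
        # uses \r\n line endings.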
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class EscapeDecodeTest(unittest.TestCase):
    def test_empty_escape_decode(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),


    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

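# Sanity check: every entry must be a (unicode, punycode) pair.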
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class CodecTest(unittest.TestCase):
    def test_builtin(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
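        # '\xed\x95\x9c' and '\xea\xb8\x80' are the UTF-8 encodings of
        # u'\ud55c' and u'\uae00' (two Hangul syllables).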
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

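# Encodings that are expected to round-trip unicode text; BasicUnicodeTest
# runs each of them over a small sample string, both directly and through
# stream readers/writers.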
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
# "undefined"

# The following encodings don't work in stateful mode
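# (the stream-based checks in BasicUnicodeTest and test_seek skip them)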
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

def test_main():
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        NameprepTest,
        CodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        BasicUnicodeTest,
        BasicStrTest
    )


if __name__ == "__main__":
    test_main()