#!/usr/bin/env python
#
# test_multibytecodec.py
#   Unit test for multibytecodec itself
#

7from test import test_support
Hye-Shik Chang84392be2006-07-06 15:21:52 +00008from test.test_support import TESTFN
9import unittest, StringIO, codecs, sys, os
Georg Brandlb9b68ae2008-07-16 22:04:20 +000010import _multibytecodec
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +000011
# Names of the CJK codecs the tests below iterate over, grouped by the
# extension module named in each group's comment.
ALL_CJKENCODINGS = [
    # _codecs_cn
    'gb2312', 'gbk', 'gb18030', 'hz',
    # _codecs_hk
    'big5hkscs',
    # _codecs_jp
    'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
    'euc_jis_2004', 'shift_jis_2004',
    # _codecs_kr
    'cp949', 'euc_kr', 'johab',
    # _codecs_tw
    'big5', 'cp950',
    # _codecs_iso2022
    'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
    'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
]
28
class Test_MultibyteCodec(unittest.TestCase):
    """Codec-level checks run against the CJK multibyte codecs."""

    def test_nullcoding(self):
        """Empty input maps to empty output in both directions."""
        for enc in ALL_CJKENCODINGS:
            self.assertEqual(''.decode(enc), u'')
            self.assertEqual(unicode('', enc), u'')
            self.assertEqual(u''.encode(enc), '')

    def test_str_decode(self):
        """Calling encode() on the byte string 'abcd' returns 'abcd'."""
        for enc in ALL_CJKENCODINGS:
            self.assertEqual('abcd'.encode(enc), 'abcd')

    def test_errorcallback_longindex(self):
        """A handler returning sys.maxint+1 as resume index gives IndexError."""
        dec = codecs.getdecoder('euc-kr')
        # The handler's second tuple item is the position to resume at.
        myreplace = lambda exc: (u'', sys.maxint+1)
        codecs.register_error('test.cjktest', myreplace)
        self.assertRaises(IndexError, dec,
                          'apple\x92ham\x93spam', 'test.cjktest')

    def test_codingspec(self):
        """A file holding only a '# coding: <enc>' line can be exec'd."""
        try:
            for enc in ALL_CJKENCODINGS:
                # Close each handle explicitly instead of relying on
                # garbage collection to release it.
                with open(TESTFN, 'w') as f:
                    f.write('# coding: %s\n' % enc)
                with open(TESTFN) as f:
                    exec(f)
        finally:
            # Only remove the file if it was actually created, so that a
            # failure to create it is not masked by an unlink error.
            if os.path.exists(TESTFN):
                os.unlink(TESTFN)

    def test_init_segfault(self):
        """Stream classes reject None with AttributeError."""
        # bug #3305: this used to segfault
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamReader, None)
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamWriter, None)
62
63
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000064class Test_IncrementalEncoder(unittest.TestCase):
65
66 def test_stateless(self):
67 # cp949 encoder isn't stateful at all.
68 encoder = codecs.getincrementalencoder('cp949')()
69 self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
70 '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
71 self.assertEqual(encoder.reset(), None)
72 self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
73 '\xa1\xd9\xa1\xad\xa1\xd9')
74 self.assertEqual(encoder.reset(), None)
75 self.assertEqual(encoder.encode(u'', True), '')
76 self.assertEqual(encoder.encode(u'', False), '')
77 self.assertEqual(encoder.reset(), None)
78
79 def test_stateful(self):
80 # jisx0213 encoder is stateful for a few codepoints. eg)
81 # U+00E6 => A9DC
82 # U+00E6 U+0300 => ABC4
83 # U+0300 => ABDC
84
85 encoder = codecs.getincrementalencoder('jisx0213')()
86 self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
87 self.assertEqual(encoder.encode(u'\u00e6'), '')
88 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
89 self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
90
91 self.assertEqual(encoder.reset(), None)
92 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
93
94 self.assertEqual(encoder.encode(u'\u00e6'), '')
95 self.assertEqual(encoder.encode('', True), '\xa9\xdc')
96 self.assertEqual(encoder.encode('', True), '')
97
98 def test_stateful_keep_buffer(self):
99 encoder = codecs.getincrementalencoder('jisx0213')()
100 self.assertEqual(encoder.encode(u'\u00e6'), '')
101 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
102 self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
103 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
104 self.assertEqual(encoder.reset(), None)
105 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
106 self.assertEqual(encoder.encode(u'\u00e6'), '')
107 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
108 self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
109
110
111class Test_IncrementalDecoder(unittest.TestCase):
112
113 def test_dbcs(self):
114 # cp949 decoder is simple with only 1 or 2 bytes sequences.
115 decoder = codecs.getincrementaldecoder('cp949')()
116 self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
117 u'\ud30c\uc774')
118 self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
119 u'\uc36c \ub9c8\uc744')
120 self.assertEqual(decoder.decode(''), u'')
121
122 def test_dbcs_keep_buffer(self):
123 decoder = codecs.getincrementaldecoder('cp949')()
124 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
125 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
126 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
127
128 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
129 self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
130 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
131
132 def test_iso2022(self):
133 decoder = codecs.getincrementaldecoder('iso2022-jp')()
134 ESC = '\x1b'
135 self.assertEqual(decoder.decode(ESC + '('), u'')
136 self.assertEqual(decoder.decode('B', True), u'')
137 self.assertEqual(decoder.decode(ESC + '$'), u'')
138 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
139 self.assertEqual(decoder.decode('@$@'), u'\u4e16')
140 self.assertEqual(decoder.decode('$', True), u'\u4e16')
141 self.assertEqual(decoder.reset(), None)
142 self.assertEqual(decoder.decode('@$'), u'@$')
143 self.assertEqual(decoder.decode(ESC + '$'), u'')
144 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
145 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
146
class Test_StreamReader(unittest.TestCase):
    """Stream reader checks for the cp949 codec."""

    def test_bug1728403(self):
        """Reading a file that holds only the byte 0xA1 raises an error."""
        try:
            # Close the writer before reopening the file for reading.
            with open(TESTFN, 'wb') as f:
                f.write('\xa1')
            f = codecs.open(TESTFN, encoding='cp949')
            try:
                self.assertRaises(UnicodeDecodeError, f.read, 2)
            finally:
                f.close()
        finally:
            # Only remove the file if it was actually created, so that a
            # failure to create it is not masked by an unlink error.
            if os.path.exists(TESTFN):
                os.unlink(TESTFN)
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000157
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000158class Test_StreamWriter(unittest.TestCase):
159 if len(u'\U00012345') == 2: # UCS2
160 def test_gb18030(self):
Hye-Shik Chang9b541402007-06-05 18:58:51 +0000161 s = StringIO.StringIO()
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000162 c = codecs.getwriter('gb18030')(s)
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000163 c.write(u'123')
164 self.assertEqual(s.getvalue(), '123')
165 c.write(u'\U00012345')
166 self.assertEqual(s.getvalue(), '123\x907\x959')
167 c.write(u'\U00012345'[0])
168 self.assertEqual(s.getvalue(), '123\x907\x959')
169 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
170 self.assertEqual(s.getvalue(),
171 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
172 c.write(u'\U00012345'[0])
173 self.assertEqual(s.getvalue(),
174 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
175 self.assertRaises(UnicodeError, c.reset)
176 self.assertEqual(s.getvalue(),
177 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
178
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000179 def test_utf_8(self):
180 s= StringIO.StringIO()
181 c = codecs.getwriter('utf-8')(s)
182 c.write(u'123')
183 self.assertEqual(s.getvalue(), '123')
184 c.write(u'\U00012345')
185 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
186
187 # Python utf-8 codec can't buffer surrogate pairs yet.
188 if 0:
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000189 c.write(u'\U00012345'[0])
190 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
191 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
192 self.assertEqual(s.getvalue(),
193 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
194 '\xea\xb0\x80\xc2\xac')
195 c.write(u'\U00012345'[0])
196 self.assertEqual(s.getvalue(),
197 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
198 '\xea\xb0\x80\xc2\xac')
199 c.reset()
200 self.assertEqual(s.getvalue(),
201 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
202 '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
203 c.write(u'\U00012345'[1])
204 self.assertEqual(s.getvalue(),
205 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
206 '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
207
208 else: # UCS4
209 pass
210
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000211 def test_streamwriter_strwrite(self):
212 s = StringIO.StringIO()
213 wr = codecs.getwriter('gb18030')(s)
214 wr.write('abcd')
215 self.assertEqual(s.getvalue(), 'abcd')
216
Hye-Shik Changabb903f2006-03-13 10:20:08 +0000217class Test_ISO2022(unittest.TestCase):
218 def test_g2(self):
219 iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
220 uni = u':hu4:unit\xe9 de famille'
221 self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
222
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000223 def test_iso2022_jp_g0(self):
Ezio Melottiaa980582010-01-23 23:04:36 +0000224 self.assertNotIn('\x0e', u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000225 for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
226 e = u'\u3406'.encode(encoding)
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000227 self.assertFalse(filter(lambda x: x >= '\x80', e))
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000228
Hye-Shik Changb7883462006-10-08 13:48:34 +0000229 def test_bug1572832(self):
230 if sys.maxunicode >= 0x10000:
231 myunichr = unichr
232 else:
233 myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
234
235 for x in xrange(0x10000, 0x110000):
236 # Any ISO 2022 codec will cause the segfault
237 myunichr(x).encode('iso_2022_jp', 'ignore')
238
def test_main():
    """Run the tests in this module through test_support.run_unittest."""
    test_support.run_unittest(__name__)


if __name__ == "__main__":
    test_main()