blob: 090374cdafb0d752fb756b3b0f6cdd7dbe74e8cf [file] [log] [blame]
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +00001#!/usr/bin/env python
2#
3# test_multibytecodec.py
4# Unit test for multibytecodec itself
5#
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +00006
7from test import test_support
8from test import test_multibytecodec_support
Hye-Shik Chang84392be2006-07-06 15:21:52 +00009from test.test_support import TESTFN
10import unittest, StringIO, codecs, sys, os
Georg Brandlb9b68ae2008-07-16 22:04:20 +000011import _multibytecodec
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +000012
# Names of every CJK codec the tests below iterate over.  Each sub-list is
# labelled with the codec module name it is filed under.
ALL_CJKENCODINGS = (
    # _codecs_cn
    ['gb2312', 'gbk', 'gb18030', 'hz'] +
    # _codecs_hk
    ['big5hkscs'] +
    # _codecs_jp
    ['cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
     'euc_jis_2004', 'shift_jis_2004'] +
    # _codecs_kr
    ['cp949', 'euc_kr', 'johab'] +
    # _codecs_tw
    ['big5', 'cp950'] +
    # _codecs_iso2022
    ['iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
     'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr']
)
29
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000030class Test_MultibyteCodec(unittest.TestCase):
31
32 def test_nullcoding(self):
Hye-Shik Changb9aa7ea2006-07-06 15:39:24 +000033 for enc in ALL_CJKENCODINGS:
34 self.assertEqual(''.decode(enc), u'')
35 self.assertEqual(unicode('', enc), u'')
36 self.assertEqual(u''.encode(enc), '')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000037
38 def test_str_decode(self):
Hye-Shik Changb9aa7ea2006-07-06 15:39:24 +000039 for enc in ALL_CJKENCODINGS:
40 self.assertEqual('abcd'.encode(enc), 'abcd')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000041
Hye-Shik Chang9f4b6322006-03-26 06:21:34 +000042 def test_errorcallback_longindex(self):
43 dec = codecs.getdecoder('euc-kr')
44 myreplace = lambda exc: (u'', sys.maxint+1)
45 codecs.register_error('test.cjktest', myreplace)
46 self.assertRaises(IndexError, dec,
47 'apple\x92ham\x93spam', 'test.cjktest')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000048
Hye-Shik Chang84392be2006-07-06 15:21:52 +000049 def test_codingspec(self):
Hye-Shik Chang84392be2006-07-06 15:21:52 +000050 try:
Hye-Shik Changb9aa7ea2006-07-06 15:39:24 +000051 for enc in ALL_CJKENCODINGS:
52 print >> open(TESTFN, 'w'), '# coding:', enc
53 exec open(TESTFN)
Hye-Shik Chang84392be2006-07-06 15:21:52 +000054 finally:
55 os.unlink(TESTFN)
56
Georg Brandlb9b68ae2008-07-16 22:04:20 +000057 def test_init_segfault(self):
58 # bug #3305: this used to segfault
59 self.assertRaises(AttributeError,
60 _multibytecodec.MultibyteStreamReader, None)
61 self.assertRaises(AttributeError,
62 _multibytecodec.MultibyteStreamWriter, None)
63
64
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000065class Test_IncrementalEncoder(unittest.TestCase):
66
67 def test_stateless(self):
68 # cp949 encoder isn't stateful at all.
69 encoder = codecs.getincrementalencoder('cp949')()
70 self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
71 '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
72 self.assertEqual(encoder.reset(), None)
73 self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
74 '\xa1\xd9\xa1\xad\xa1\xd9')
75 self.assertEqual(encoder.reset(), None)
76 self.assertEqual(encoder.encode(u'', True), '')
77 self.assertEqual(encoder.encode(u'', False), '')
78 self.assertEqual(encoder.reset(), None)
79
80 def test_stateful(self):
81 # jisx0213 encoder is stateful for a few codepoints. eg)
82 # U+00E6 => A9DC
83 # U+00E6 U+0300 => ABC4
84 # U+0300 => ABDC
85
86 encoder = codecs.getincrementalencoder('jisx0213')()
87 self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
88 self.assertEqual(encoder.encode(u'\u00e6'), '')
89 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
90 self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
91
92 self.assertEqual(encoder.reset(), None)
93 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
94
95 self.assertEqual(encoder.encode(u'\u00e6'), '')
96 self.assertEqual(encoder.encode('', True), '\xa9\xdc')
97 self.assertEqual(encoder.encode('', True), '')
98
99 def test_stateful_keep_buffer(self):
100 encoder = codecs.getincrementalencoder('jisx0213')()
101 self.assertEqual(encoder.encode(u'\u00e6'), '')
102 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
103 self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
104 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
105 self.assertEqual(encoder.reset(), None)
106 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
107 self.assertEqual(encoder.encode(u'\u00e6'), '')
108 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
109 self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
110
111
112class Test_IncrementalDecoder(unittest.TestCase):
113
114 def test_dbcs(self):
115 # cp949 decoder is simple with only 1 or 2 bytes sequences.
116 decoder = codecs.getincrementaldecoder('cp949')()
117 self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
118 u'\ud30c\uc774')
119 self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
120 u'\uc36c \ub9c8\uc744')
121 self.assertEqual(decoder.decode(''), u'')
122
123 def test_dbcs_keep_buffer(self):
124 decoder = codecs.getincrementaldecoder('cp949')()
125 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
126 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
127 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
128
129 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
130 self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
131 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
132
133 def test_iso2022(self):
134 decoder = codecs.getincrementaldecoder('iso2022-jp')()
135 ESC = '\x1b'
136 self.assertEqual(decoder.decode(ESC + '('), u'')
137 self.assertEqual(decoder.decode('B', True), u'')
138 self.assertEqual(decoder.decode(ESC + '$'), u'')
139 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
140 self.assertEqual(decoder.decode('@$@'), u'\u4e16')
141 self.assertEqual(decoder.decode('$', True), u'\u4e16')
142 self.assertEqual(decoder.reset(), None)
143 self.assertEqual(decoder.decode('@$'), u'@$')
144 self.assertEqual(decoder.decode(ESC + '$'), u'')
145 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
146 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
147
class Test_StreamReader(unittest.TestCase):
    def test_bug1728403(self):
        # Reading from a cp949 stream whose only content is the single byte
        # '\xa1' must raise UnicodeDecodeError.
        f = None
        try:
            # Close the writer explicitly so the byte is on disk before the
            # file is reopened for reading.
            out = open(TESTFN, 'w')
            try:
                out.write('\xa1')
            finally:
                out.close()

            f = codecs.open(TESTFN, encoding='cp949')
            self.assertRaises(UnicodeDecodeError, f.read, 2)
        finally:
            # f stays None when codecs.open() itself failed; checking for
            # that replaces the bare "except:" that used to hide the
            # resulting NameError (and every other exception).
            if f is not None:
                try:
                    f.close()
                except EnvironmentError:
                    # Closing is best-effort; still try to remove the file.
                    pass
            # The file may not exist if the first open() failed; do not let
            # the cleanup replace that error with an OSError.
            if os.path.exists(TESTFN):
                os.unlink(TESTFN)
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000158
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000159class Test_StreamWriter(unittest.TestCase):
160 if len(u'\U00012345') == 2: # UCS2
161 def test_gb18030(self):
Hye-Shik Chang9b541402007-06-05 18:58:51 +0000162 s = StringIO.StringIO()
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000163 c = codecs.getwriter('gb18030')(s)
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000164 c.write(u'123')
165 self.assertEqual(s.getvalue(), '123')
166 c.write(u'\U00012345')
167 self.assertEqual(s.getvalue(), '123\x907\x959')
168 c.write(u'\U00012345'[0])
169 self.assertEqual(s.getvalue(), '123\x907\x959')
170 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
171 self.assertEqual(s.getvalue(),
172 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
173 c.write(u'\U00012345'[0])
174 self.assertEqual(s.getvalue(),
175 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
176 self.assertRaises(UnicodeError, c.reset)
177 self.assertEqual(s.getvalue(),
178 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
179
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000180 def test_utf_8(self):
181 s= StringIO.StringIO()
182 c = codecs.getwriter('utf-8')(s)
183 c.write(u'123')
184 self.assertEqual(s.getvalue(), '123')
185 c.write(u'\U00012345')
186 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
187
188 # Python utf-8 codec can't buffer surrogate pairs yet.
189 if 0:
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000190 c.write(u'\U00012345'[0])
191 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
192 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
193 self.assertEqual(s.getvalue(),
194 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
195 '\xea\xb0\x80\xc2\xac')
196 c.write(u'\U00012345'[0])
197 self.assertEqual(s.getvalue(),
198 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
199 '\xea\xb0\x80\xc2\xac')
200 c.reset()
201 self.assertEqual(s.getvalue(),
202 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
203 '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
204 c.write(u'\U00012345'[1])
205 self.assertEqual(s.getvalue(),
206 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
207 '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
208
209 else: # UCS4
210 pass
211
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000212 def test_streamwriter_strwrite(self):
213 s = StringIO.StringIO()
214 wr = codecs.getwriter('gb18030')(s)
215 wr.write('abcd')
216 self.assertEqual(s.getvalue(), 'abcd')
217
Hye-Shik Changabb903f2006-03-13 10:20:08 +0000218class Test_ISO2022(unittest.TestCase):
219 def test_g2(self):
220 iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
221 uni = u':hu4:unit\xe9 de famille'
222 self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
223
Hye-Shik Chang199f1db2006-09-05 12:07:09 +0000224 def test_iso2022_jp_g0(self):
225 self.failIf('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
226 for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
227 e = u'\u3406'.encode(encoding)
228 self.failIf(filter(lambda x: x >= '\x80', e))
229
Hye-Shik Changb7883462006-10-08 13:48:34 +0000230 def test_bug1572832(self):
231 if sys.maxunicode >= 0x10000:
232 myunichr = unichr
233 else:
234 myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
235
236 for x in xrange(0x10000, 0x110000):
237 # Any ISO 2022 codec will cause the segfault
238 myunichr(x).encode('iso_2022_jp', 'ignore')
239
def test_main():
    """Run every test case defined in this module via test_support."""
    this_module = __name__
    test_support.run_unittest(this_module)

if __name__ == "__main__":
    test_main()