#!/usr/bin/env python
#
# test_multibytecodec.py
# Unit test for multibytecodec itself
#
# $CJKCodecs: test_multibytecodec.py,v 1.8 2004/06/19 06:09:55 perky Exp $

from test import test_support
from test import test_multibytecodec_support
import unittest
import StringIO
import codecs
import sys

Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000012class Test_MultibyteCodec(unittest.TestCase):
13
14 def test_nullcoding(self):
15 self.assertEqual(''.decode('gb18030'), u'')
16 self.assertEqual(unicode('', 'gb18030'), u'')
17 self.assertEqual(u''.encode('gb18030'), '')
18
19 def test_str_decode(self):
20 self.assertEqual('abcd'.encode('gb18030'), 'abcd')
21
Hye-Shik Chang9f4b6322006-03-26 06:21:34 +000022 def test_errorcallback_longindex(self):
23 dec = codecs.getdecoder('euc-kr')
24 myreplace = lambda exc: (u'', sys.maxint+1)
25 codecs.register_error('test.cjktest', myreplace)
26 self.assertRaises(IndexError, dec,
27 'apple\x92ham\x93spam', 'test.cjktest')
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +000028
29class Test_IncrementalEncoder(unittest.TestCase):
30
31 def test_stateless(self):
32 # cp949 encoder isn't stateful at all.
33 encoder = codecs.getincrementalencoder('cp949')()
34 self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
35 '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
36 self.assertEqual(encoder.reset(), None)
37 self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
38 '\xa1\xd9\xa1\xad\xa1\xd9')
39 self.assertEqual(encoder.reset(), None)
40 self.assertEqual(encoder.encode(u'', True), '')
41 self.assertEqual(encoder.encode(u'', False), '')
42 self.assertEqual(encoder.reset(), None)
43
44 def test_stateful(self):
45 # jisx0213 encoder is stateful for a few codepoints. eg)
46 # U+00E6 => A9DC
47 # U+00E6 U+0300 => ABC4
48 # U+0300 => ABDC
49
50 encoder = codecs.getincrementalencoder('jisx0213')()
51 self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
52 self.assertEqual(encoder.encode(u'\u00e6'), '')
53 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
54 self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
55
56 self.assertEqual(encoder.reset(), None)
57 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
58
59 self.assertEqual(encoder.encode(u'\u00e6'), '')
60 self.assertEqual(encoder.encode('', True), '\xa9\xdc')
61 self.assertEqual(encoder.encode('', True), '')
62
63 def test_stateful_keep_buffer(self):
64 encoder = codecs.getincrementalencoder('jisx0213')()
65 self.assertEqual(encoder.encode(u'\u00e6'), '')
66 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
67 self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
68 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
69 self.assertEqual(encoder.reset(), None)
70 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
71 self.assertEqual(encoder.encode(u'\u00e6'), '')
72 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
73 self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
74
75
76class Test_IncrementalDecoder(unittest.TestCase):
77
78 def test_dbcs(self):
79 # cp949 decoder is simple with only 1 or 2 bytes sequences.
80 decoder = codecs.getincrementaldecoder('cp949')()
81 self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
82 u'\ud30c\uc774')
83 self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
84 u'\uc36c \ub9c8\uc744')
85 self.assertEqual(decoder.decode(''), u'')
86
87 def test_dbcs_keep_buffer(self):
88 decoder = codecs.getincrementaldecoder('cp949')()
89 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
90 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
91 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
92
93 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
94 self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
95 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
96
97 def test_iso2022(self):
98 decoder = codecs.getincrementaldecoder('iso2022-jp')()
99 ESC = '\x1b'
100 self.assertEqual(decoder.decode(ESC + '('), u'')
101 self.assertEqual(decoder.decode('B', True), u'')
102 self.assertEqual(decoder.decode(ESC + '$'), u'')
103 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
104 self.assertEqual(decoder.decode('@$@'), u'\u4e16')
105 self.assertEqual(decoder.decode('$', True), u'\u4e16')
106 self.assertEqual(decoder.reset(), None)
107 self.assertEqual(decoder.decode('@$'), u'@$')
108 self.assertEqual(decoder.decode(ESC + '$'), u'')
109 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
110 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
111
112
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000113class Test_StreamWriter(unittest.TestCase):
114 if len(u'\U00012345') == 2: # UCS2
115 def test_gb18030(self):
116 s= StringIO.StringIO()
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000117 c = codecs.getwriter('gb18030')(s)
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000118 c.write(u'123')
119 self.assertEqual(s.getvalue(), '123')
120 c.write(u'\U00012345')
121 self.assertEqual(s.getvalue(), '123\x907\x959')
122 c.write(u'\U00012345'[0])
123 self.assertEqual(s.getvalue(), '123\x907\x959')
124 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
125 self.assertEqual(s.getvalue(),
126 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
127 c.write(u'\U00012345'[0])
128 self.assertEqual(s.getvalue(),
129 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
130 self.assertRaises(UnicodeError, c.reset)
131 self.assertEqual(s.getvalue(),
132 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
133
Hye-Shik Change2ac4ab2006-03-26 02:34:59 +0000134 def test_utf_8(self):
135 s= StringIO.StringIO()
136 c = codecs.getwriter('utf-8')(s)
137 c.write(u'123')
138 self.assertEqual(s.getvalue(), '123')
139 c.write(u'\U00012345')
140 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
141
142 # Python utf-8 codec can't buffer surrogate pairs yet.
143 if 0:
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000144 c.write(u'\U00012345'[0])
145 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
146 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
147 self.assertEqual(s.getvalue(),
148 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
149 '\xea\xb0\x80\xc2\xac')
150 c.write(u'\U00012345'[0])
151 self.assertEqual(s.getvalue(),
152 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
153 '\xea\xb0\x80\xc2\xac')
154 c.reset()
155 self.assertEqual(s.getvalue(),
156 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
157 '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
158 c.write(u'\U00012345'[1])
159 self.assertEqual(s.getvalue(),
160 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
161 '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
162
163 else: # UCS4
164 pass
165
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000166 def test_streamwriter_strwrite(self):
167 s = StringIO.StringIO()
168 wr = codecs.getwriter('gb18030')(s)
169 wr.write('abcd')
170 self.assertEqual(s.getvalue(), 'abcd')
171
Hye-Shik Changabb903f2006-03-13 10:20:08 +0000172class Test_ISO2022(unittest.TestCase):
173 def test_g2(self):
174 iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
175 uni = u':hu4:unit\xe9 de famille'
176 self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
177
def test_main():
    """Collect every test case class of this module into one suite and run it."""
    suite = unittest.TestSuite()
    # Same classes, same order as before; only the repetition is gone.
    for case in (Test_MultibyteCodec,
                 Test_IncrementalEncoder,
                 Test_IncrementalDecoder,
                 Test_StreamWriter,
                 Test_ISO2022):
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)

# Run the whole module's tests when executed as a script.
if __name__ == "__main__":
    test_main()