blob: e042d8033d34d39ed87f764754d5da57330a44ee [file] [log] [blame]
/*
 * iso2022common.h: Common Codec Routines for ISO-2022 codecs.
 *
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 * $CJKCodecs: iso2022common.h,v 1.8 2003/12/31 05:46:55 perky Exp $
 */
7
/* This ISO-2022 implementation is intended to comply with ECMA-43 Level 1
 * rather than with the RFCs themselves. */
10
/* Control bytes recognised by the ISO-2022 decoding macros in this file. */
#define ESC 0x1b    /* introduces an escape sequence */
#define SO  0x0e    /* sets F_SHIFTED (see SHIFT_CASES) */
#define SI  0x0f    /* clears F_SHIFTED (see SHIFT_CASES) */

/* Upper bound on the number of bytes scanned while looking for the final
 * byte of an escape sequence. */
#define MAX_ESCSEQLEN 16

/* Non-zero when c is a byte that ends an escape sequence: 'A'..'Z' or '@'.
 * The argument is evaluated more than once; pass a side-effect-free value. */
#define IS_ESCEND(c) (((c) >= 'A' && (c) <= 'Z') || (c) == '@')

/* Non-zero when c2, the byte that follows ESC, starts an escape sequence
 * that this implementation parses.
 *
 * This is not a full list of ISO-2022 escape sequence headers,
 * but it is enough to implement the CJK instances of ISO-2022.
 * The argument is evaluated more than once. */
#define IS_ISO2022ESC(c2) \
    ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
     (c2) == '.' || (c2) == '&')
22
/* STATE

   state->c[0-3]

        00000000
        ||^^^^^|
        |+-----+----  G0-3 Character Set
        +-----------  Is G0-3 double byte?

   state->c[4]

        00000000
              ||
              |+----  Locked-Shift?   (F_SHIFTED, 0x01)
              +-----  ESC Throughout  (F_ESCTHROUGHOUT, 0x02)
*/
39
/*
 * Character set identifiers as stored in state->c[0..3].
 *
 * The low seven bits hold the final byte of the designating escape
 * sequence; the top bit (CHARSET_DOUBLEBYTE) marks a double-byte set.
 * Because of that flag, identifiers that share a final byte (for example
 * CHARSET_ASCII and CHARSET_JISX0208, both 'B') remain distinct.
 */
#define CHARSET_DOUBLEBYTE  0x80

#define CHARSET_ASCII       'B'

#define CHARSET_ISO8859_1   'A'
#define CHARSET_ISO8859_7   'F'

#define CHARSET_KSX1001     ('C'|CHARSET_DOUBLEBYTE)

#define CHARSET_JISX0201_R  'J'
#define CHARSET_JISX0201_K  'I'
#define CHARSET_JISX0208    ('B'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0208_O  ('@'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0212    ('D'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0213_1  ('O'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0213_2  ('P'|CHARSET_DOUBLEBYTE)

#define CHARSET_GB2312      ('A'|CHARSET_DOUBLEBYTE)
#define CHARSET_GB2312_8565 ('E'|CHARSET_DOUBLEBYTE)

/* Final byte of the designation, with the double-byte flag stripped. */
#define CHARSET_DESIGN(c)   ((c) & 0x7f)
/* Non-zero (0x80) when the identifier names a double-byte set. */
#define CHARSET_ISDBCS(c)   ((c) & 0x80)
62
/* Flag bits kept in state->c[4]. */
#define F_SHIFTED       0x01    /* set by SO, cleared by SI and '\n' */
#define F_ESCTHROUGHOUT 0x02    /* passing a non-ISO-2022 escape through */

/*
 * Accessors for the designation slots state->c[0..3] (G0..G3).
 *
 * NOTE: the "set" style macros below expand to a complete statement,
 * including the terminating semicolon.  Call sites in this file invoke
 * them without a trailing ';', so the semicolon must stay in the macro.
 */
#define STATE_SETG(dn, s, v)    ((s)->c[dn]) = (v);
#define STATE_GETG(dn, s)       ((s)->c[dn])

#define STATE_SETG0(s, v)       STATE_SETG(0, s, v)
#define STATE_GETG0(s)          STATE_GETG(0, s)
#define STATE_SETG1(s, v)       STATE_SETG(1, s, v)
#define STATE_GETG1(s)          STATE_GETG(1, s)
#define STATE_SETG2(s, v)       STATE_SETG(2, s, v)
#define STATE_GETG2(s)          STATE_GETG(2, s)
#define STATE_SETG3(s, v)       STATE_SETG(3, s, v)
#define STATE_GETG3(s)          STATE_GETG(3, s)

/* Accessors for the flag byte state->c[4].  STATE_GETFLAG yields the masked
 * bits (non-zero when any bit of f is set), not a normalised 0/1. */
#define STATE_SETFLAG(s, f)     ((s)->c[4]) |= (f);
#define STATE_GETFLAG(s, f)     ((s)->c[4] & (f))
#define STATE_CLEARFLAG(s, f)   ((s)->c[4]) &= ~(f);
#define STATE_CLEARFLAGS(s)     ((s)->c[4]) = 0;
82
/*
 * Select the character set that applies to input byte c1 and store its
 * identifier in `charset`.
 *
 * Expands to statements, so it must be used inside a function that returns
 * an integer and has a variable named `state` in scope.  Bytes >= 0x80 make
 * the enclosing function return 1.  Otherwise the set designated to G1 is
 * chosen while F_SHIFTED is set, and the set designated to G0 when it is
 * not.
 */
#define ISO2022_GETCHARSET(charset, c1) \
    if ((c1) >= 0x80) \
        return 1; \
    if (STATE_GETFLAG(state, F_SHIFTED)) /* G1 */ \
        (charset) = STATE_GETG1(state); \
    else /* G0 */ \
        (charset) = STATE_GETG0(state);
90
/*
 * SS2_ROUTINE: handling of the single-shift-2 sequence "ESC N <byte>".
 *
 * It is spliced between `else` and `{` in ISO2022_BASECASES, so when enabled
 * it expands to "if (...) { ... } else" and when disabled to nothing.
 *
 * The byte after "ESC N" is decoded with whatever set is designated to G2
 * and written as a single output unit; three input bytes are consumed.
 * ISO8859_1_DECODE / ISO8859_7_DECODE are defined elsewhere; the `else
 * return 3;` that follows each call relies on those macros ending in an
 * unterminated `if` -- NOTE(review): confirm against their definitions.
 */
#ifdef ISO2022_USE_G2_DESIGNATION
/* hardcoded for iso-2022-jp-2 for now. we'll need to generalize it
   when we have more G2 designating encodings */
#define SS2_ROUTINE \
    if (IN2 == 'N') { /* SS2 */ \
        RESERVE_INBUF(3) \
        /* every branch below stores into **outbuf, so make sure */ \
        /* there is room first, as the other writers here do */ \
        RESERVE_OUTBUF(1) \
        if (STATE_GETG2(state) == CHARSET_ISO8859_1) { \
            ISO8859_1_DECODE(IN3 ^ 0x80, **outbuf) \
            else return 3; \
        } else if (STATE_GETG2(state) == CHARSET_ISO8859_7) { \
            ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf) \
            else return 3; \
        } else if (STATE_GETG2(state) == CHARSET_ASCII) { \
            if (IN3 & 0x80) return 3; \
            else **outbuf = IN3; \
        } else \
            return MBERR_INTERNAL; \
        NEXT(3, 1) \
    } else
#else
#define SS2_ROUTINE
#endif
113
/*
 * SHIFT_CASES: `case` labels for the SI and SO control bytes, meant to be
 * expanded inside the decoder's `switch`.  SI clears F_SHIFTED, SO sets it;
 * each consumes one input byte and produces no output.
 *
 * Defining ISO2022_NO_SHIFT turns the macro into nothing, so SI/SO fall
 * through to the switch's default handling instead.
 */
#ifndef ISO2022_NO_SHIFT
#define SHIFT_CASES \
    case SI: \
        STATE_CLEARFLAG(state, F_SHIFTED) \
        NEXT_IN(1) \
        break; \
    case SO: \
        STATE_SETFLAG(state, F_SHIFTED) \
        NEXT_IN(1) \
        break;
#else
/* for compatibility with JapaneseCodecs */
#define SHIFT_CASES
#endif
128
/*
 * ISO2022_BASECASES: the `case` labels shared by every ISO-2022 decoder,
 * expanded inside the `switch` opened by ISO2022_LOOP_BEGIN.  The argument
 * is not used by the expansion.
 *
 *   ESC   - needs at least two input bytes.  A recognised ISO-2022 header
 *           is handed to iso2022processesc(); a non-zero result is returned
 *           from the enclosing function.  Otherwise SS2_ROUTINE gets a
 *           chance (when enabled), and anything else switches on
 *           "ESC throughout" mode and copies the ESC byte to the output.
 *   SI/SO - see SHIFT_CASES.
 *   '\n'  - clears F_SHIFTED and is copied to the output.
 *
 * Relies on `state`, `inbuf`, `inleft`, `outbuf` and the RESERVE_* / OUT1 /
 * WRITE1 / NEXT helpers being in scope at the point of expansion.
 */
#define ISO2022_BASECASES(c1) \
    case ESC: \
        RESERVE_INBUF(2) \
        if (IS_ISO2022ESC(IN2)) { \
            int err; \
            err = iso2022processesc(state, inbuf, &inleft); \
            if (err != 0) \
                return err; \
        } else SS2_ROUTINE { \
            /* check for room before touching the state, so a */ \
            /* failed reservation leaves the decoder unchanged */ \
            RESERVE_OUTBUF(1) \
            STATE_SETFLAG(state, F_ESCTHROUGHOUT) \
            OUT1(ESC) \
            NEXT(1, 1) \
        } \
        break; \
    SHIFT_CASES \
    case '\n': \
        STATE_CLEARFLAG(state, F_SHIFTED) \
        WRITE1('\n') \
        NEXT(1, 1) \
        break;
149
/*
 * ISO2022_ESCTHROUGHOUT: pass-through of escape sequences that are not
 * ISO-2022 designations.
 *
 * While F_ESCTHROUGHOUT is set, byte c is copied to the output as-is and one
 * input byte is consumed.  The flag is cleared once c is a final byte
 * (IS_ESCEND), i.e. after the end of the foreign escape sequence has been
 * copied.  The expansion ends with `continue`, so it must sit directly
 * inside the decoding loop, before the `switch`.
 */
#define ISO2022_ESCTHROUGHOUT(c) \
    if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { \
        /* ESC throughout mode: for non-iso2022 escape sequences */ \
        RESERVE_OUTBUF(1) \
        OUT1(c) /* assume as ISO-8859-1 */ \
        NEXT(1, 1) \
        if (IS_ESCEND(c)) { \
            STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) \
        } \
        continue; \
    }
161
/*
 * ISO2022_LOOP_BEGIN / ISO2022_LOOP_END bracket the codec-specific part of
 * a decoder:
 *
 *     ISO2022_LOOP_BEGIN
 *         ... handle a byte in the range 0x20..0x7f ...
 *     ISO2022_LOOP_END
 *
 * LOOP_BEGIN opens `while (inleft > 0)`, loads the current byte into a local
 * `c`, runs the ESC-throughout check and then a `switch (c)` with the common
 * cases.  In the `default` branch, bytes below 0x20 are copied to the output
 * unchanged, bytes >= 0x80 make the enclosing function return 1, and the
 * remaining bytes reach the caller-supplied code in the trailing `else {`.
 *
 * LOOP_END closes, in order, that `else` block, the `switch` and the `while`.
 */
#define ISO2022_LOOP_BEGIN \
    while (inleft > 0) { \
        unsigned char c = IN1; \
        ISO2022_ESCTHROUGHOUT(c) \
        switch(c) { \
        ISO2022_BASECASES(c) \
        default: \
            if (c < 0x20) { /* C0 */ \
                RESERVE_OUTBUF(1) \
                OUT1(c) \
                NEXT(1, 1) \
            } else if (c >= 0x80) \
                return 1; \
            else {
#define ISO2022_LOOP_END \
            } \
        } \
    }
180
181static int
182iso2022processesc(MultibyteCodec_State *state,
183 const unsigned char **inbuf, size_t *inleft)
184{
185 unsigned char charset, designation;
Hye-Shik Changd210a5b2004-01-23 14:36:17 +0000186 size_t i, esclen;
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +0000187
188 for (i = 1;i < MAX_ESCSEQLEN;i++) {
189 if (i >= *inleft)
190 return MBERR_TOOFEW;
191 if (IS_ESCEND((*inbuf)[i])) {
192 esclen = i + 1;
193 break;
194 }
195#ifdef ISO2022_USE_JISX0208EXT
196 else if (i+1 < *inleft && (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@')
197 i += 2;
198#endif
199 }
200
201 if (i >= MAX_ESCSEQLEN)
202 return 1; /* unterminated escape sequence */
203
204 switch (esclen) {
205 case 3:
206 if (IN2 == '$') {
207 charset = IN3 | CHARSET_DOUBLEBYTE;
208 designation = 0;
209 } else {
210 charset = IN3;
211 if (IN2 == '(') designation = 0;
212 else if (IN2 == ')') designation = 1;
213#ifdef ISO2022_USE_G2_DESIGNATION
214 else if (IN2 == '.') designation = 2;
215#endif
216 else return 3;
217 }
218 break;
219 case 4:
220 if (IN2 != '$')
221 return 4;
222
223 charset = IN4 | CHARSET_DOUBLEBYTE;
224 if (IN3 == '(') designation = 0;
225 else if (IN3 == ')') designation = 1;
226 else return 4;
227 break;
228#ifdef ISO2022_USE_JISX0208EXT
229 case 6: /* designation with prefix */
230 if ((*inbuf)[3] == ESC && (*inbuf)[4] == '$' && (*inbuf)[5] == 'B') {
231 charset = 'B' | CHARSET_DOUBLEBYTE;
232 designation = 0;
233 } else
234 return 6;
235 break;
236#endif
237 default:
238 return esclen;
239 }
240
241 { /* raise error when the charset is not designated for this encoding */
242 const unsigned char dsgs[] = {ISO2022_DESIGNATIONS, '\x00'};
243
244 for (i = 0; dsgs[i] != '\x00'; i++)
245 if (dsgs[i] == charset)
246 break;
247
248 if (dsgs[i] == '\x00')
249 return esclen;
250 }
251
252 STATE_SETG(designation, state, charset)
253 *inleft -= esclen;
254 (*inbuf) += esclen;
255 return 0;
256}