Hye-Shik Chang | 3e2a306 | 2004-01-17 14:29:29 +0000 | [diff] [blame] | 1 | /* |
| 2 | * iso2022common.h: Common Codec Routines for ISO-2022 codecs. |
| 3 | * |
| 4 | * Written by Hye-Shik Chang <perky@FreeBSD.org> |
| 5 | * $CJKCodecs: iso2022common.h,v 1.8 2003/12/31 05:46:55 perky Exp $ |
| 6 | */ |
| 7 | |
| 8 | /* This ISO-2022 implementation is intended to comply with ECMA-43 Level 1 |
| 9 | * rather than with the RFCs themselves. */ |
| 10 | |
/* Control bytes handled by the ISO-2022 decoder loop. */
#define ESC             0x1b    /* starts an escape sequence */
#define SO              0x0e    /* sets F_SHIFTED (see SHIFT_CASES) */
#define SI              0x0f    /* clears F_SHIFTED (see SHIFT_CASES) */

/* iso2022processesc() gives up when no final byte is found at an index
 * below this value. */
#define MAX_ESCSEQLEN   16

/* True when `c` is a byte that terminates an escape sequence here:
 * '@' or an upper-case letter 'A'..'Z'.  `c` is evaluated more than once. */
#define IS_ESCEND(c)    ((c) == '@' || ((c) >= 'A' && (c) <= 'Z'))

/* True when `c2` (the byte right after ESC) introduces one of the escape
 * sequences that iso2022processesc() is asked to parse.
 * This is deliberately not the complete set of ISO-2022 escape sequence
 * headers; it is only what the CJK instances of ISO-2022 need. */
#define IS_ISO2022ESC(c2)                                       \
    ((c2) == '$' || (c2) == '(' || (c2) == ')' ||               \
     (c2) == '.' || (c2) == '&')
| 22 | |
/* STATE
 *
 * The codec state is a small byte array, state->c[]:
 *
 *   c[0] .. c[3]   charset currently designated to G0 .. G3
 *       bit 7       set when the charset is double-byte
 *                   (CHARSET_DOUBLEBYTE)
 *       bits 0-6    charset identifier; iso2022processesc() stores the
 *                   last byte of the designating escape sequence here
 *
 *   c[4]           mode flags
 *       bit 0       F_SHIFTED        locked shift is active
 *       bit 1       F_ESCTHROUGHOUT  passing a non-ISO-2022 escape
 *                                    sequence through unchanged
 */

#define CHARSET_DOUBLEBYTE      0x80

/* Single-byte sets. */
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_R      'J'
#define CHARSET_JISX0201_K      'I'

/* Double-byte sets: identifier byte with CHARSET_DOUBLEBYTE or-ed in. */
#define CHARSET_KSX1001         ('C'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0208        ('B'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0208_O      ('@'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0212        ('D'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0213_1      ('O'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DOUBLEBYTE)
#define CHARSET_GB2312          ('A'|CHARSET_DOUBLEBYTE)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DOUBLEBYTE)

/* Split a stored charset value into its identifier and its width bit. */
#define CHARSET_DESIGN(c)       ((c) & 0x7f)
#define CHARSET_ISDBCS(c)       ((c) & CHARSET_DOUBLEBYTE)

/* Bits of state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* NOTE: the mutating STATE_* macros below carry their own trailing ';'.
 * The call sites in this file invoke them without one, so the semicolon
 * must stay inside the macro. */

/* Read / write the charset designated to graphic set number `dn`. */
#define STATE_GETG(dn, s)       ((s)->c[dn])
#define STATE_SETG(dn, s, v)    ((s)->c[dn]) = (v);

#define STATE_GETG0(s)          STATE_GETG(0, s)
#define STATE_GETG1(s)          STATE_GETG(1, s)
#define STATE_GETG2(s)          STATE_GETG(2, s)
#define STATE_GETG3(s)          STATE_GETG(3, s)

#define STATE_SETG0(s, v)       STATE_SETG(0, s, v)
#define STATE_SETG1(s, v)       STATE_SETG(1, s, v)
#define STATE_SETG2(s, v)       STATE_SETG(2, s, v)
#define STATE_SETG3(s, v)       STATE_SETG(3, s, v)

/* Test, set and clear the mode flags kept in state->c[4]. */
#define STATE_GETFLAG(s, f)     ((s)->c[4] & (f))
#define STATE_SETFLAG(s, f)     ((s)->c[4]) |= (f);
#define STATE_CLEARFLAG(s, f)   ((s)->c[4]) &= ~(f);
#define STATE_CLEARFLAGS(s)     ((s)->c[4]) = 0;
| 82 | |
/* Pick the charset that the 7-bit byte `c1` must be decoded with and store
 * it in `charset`.
 *
 * Expands to bare statements (no trailing ';' needed at the call site) and
 * expects a variable named `state` in scope.  When `c1` has its high bit
 * set, the ENCLOSING function returns 1.  Otherwise `charset` receives the
 * G1 designation if F_SHIFTED is set and the G0 designation if it is not.
 *
 * The test uses the `c1` argument itself; the macro no longer depends on
 * the caller happening to have a variable named `c`. */
#define ISO2022_GETCHARSET(charset, c1)                         \
    if ((c1) >= 0x80)                                           \
        return 1;                                               \
    if (STATE_GETFLAG(state, F_SHIFTED)) /* G1 */               \
        (charset) = STATE_GETG1(state);                         \
    else /* G0 */                                               \
        (charset) = STATE_GETG0(state);
| 90 | |
#ifdef ISO2022_USE_G2_DESIGNATION
/* SS2_ROUTINE: handling of "ESC N" (SS2), spliced into ISO2022_BASECASES
 * between `else` and the fallback `{ ... }` block.
 *
 * When the byte after ESC is 'N', the third byte is decoded with the
 * charset currently designated to G2 and one output unit is produced; the
 * expansion ends in a dangling `else` so that the fallback block that
 * follows is only taken for other bytes.
 *
 * hardcoded for iso-2022-jp-2 for now. we'll need to generalize it
 * when we have more G2 designating encodings.
 *
 * Early exits from the enclosing function:
 *   - RESERVE_INBUF(3) / RESERVE_OUTBUF(1) when the buffers are too short
 *     (output room is reserved before **outbuf is written, matching the
 *     other output sites in this file);
 *   - 3 when the third byte cannot be decoded with the G2 charset
 *     (presumably the length of the offending sequence -- confirm against
 *     the codec's error convention);
 *   - MBERR_INTERNAL when G2 holds a charset this routine does not know.
 *
 * ISO8859_1_DECODE / ISO8859_7_DECODE are defined elsewhere; from the way
 * they are followed by `else` here they must expand to an `if` statement. */
#define SS2_ROUTINE                                                     \
    if (IN2 == 'N') { /* SS2 */                                         \
        RESERVE_INBUF(3)                                                \
        RESERVE_OUTBUF(1)                                               \
        if (STATE_GETG2(state) == CHARSET_ISO8859_1) {                  \
            ISO8859_1_DECODE(IN3 ^ 0x80, **outbuf)                      \
            else return 3;                                              \
        } else if (STATE_GETG2(state) == CHARSET_ISO8859_7) {           \
            ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)                      \
            else return 3;                                              \
        } else if (STATE_GETG2(state) == CHARSET_ASCII) {               \
            if (IN3 & 0x80) return 3;                                   \
            else **outbuf = IN3;                                        \
        } else                                                          \
            return MBERR_INTERNAL;                                      \
        NEXT(3, 1)                                                      \
    } else
#else
/* No G2 designation support: ESC N falls through to the fallback block. */
#define SS2_ROUTINE
#endif
| 113 | |
/* SHIFT_CASES: `case` labels for the SO / SI control bytes, spliced into
 * the switch built by ISO2022_BASECASES.  SO sets F_SHIFTED, SI clears it;
 * either way one input byte is consumed via NEXT_IN(1).
 * Expects `state` in scope. */
#ifdef ISO2022_NO_SHIFT
/* for compatibility with JapaneseCodecs */
#define SHIFT_CASES
#else
#define SHIFT_CASES                                             \
    case SO:                                                    \
        STATE_SETFLAG(state, F_SHIFTED)                         \
        NEXT_IN(1)                                              \
        break;                                                  \
    case SI:                                                    \
        STATE_CLEARFLAG(state, F_SHIFTED)                       \
        NEXT_IN(1)                                              \
        break;
#endif
| 128 | |
/* ISO2022_BASECASES: the `case` labels shared by every ISO-2022 decoder
 * switch (see ISO2022_LOOP_BEGIN).  Expects `state`, `inbuf`, `inleft`
 * and the output-side variables used by OUT1/WRITE1/NEXT in scope.  The
 * `c1` parameter is accepted for symmetry with the other macros but is
 * not referenced.
 *
 *   ESC   - if the next byte opens a sequence iso2022processesc() knows
 *           (IS_ISO2022ESC), let it parse and consume the sequence; a
 *           non-zero result is returned from the enclosing function.
 *         - else SS2_ROUTINE gets a chance to handle "ESC N" when it is
 *           enabled.
 *         - otherwise the ESC is copied to the output and
 *           F_ESCTHROUGHOUT is set so that ISO2022_ESCTHROUGHOUT copies
 *           the rest of the unknown sequence.  Output room is reserved
 *           BEFORE the state is touched or the byte is stored, like the
 *           other RESERVE_OUTBUF/OUT1 sites in this file.
 *   SO/SI - see SHIFT_CASES.
 *   '\n'  - clears F_SHIFTED and is copied to the output. */
#define ISO2022_BASECASES(c1)                                           \
    case ESC:                                                           \
        RESERVE_INBUF(2)                                                \
        if (IS_ISO2022ESC(IN2)) {                                       \
            int err;                                                    \
            err = iso2022processesc(state, inbuf, &inleft);             \
            if (err != 0)                                               \
                return err;                                             \
        } else SS2_ROUTINE {                                            \
            RESERVE_OUTBUF(1)                                           \
            STATE_SETFLAG(state, F_ESCTHROUGHOUT)                       \
            OUT1(ESC)                                                   \
            NEXT(1, 1)                                                  \
        }                                                               \
        break;                                                          \
    SHIFT_CASES                                                         \
    case '\n':                                                          \
        STATE_CLEARFLAG(state, F_SHIFTED)                               \
        WRITE1('\n')                                                    \
        NEXT(1, 1)                                                      \
        break;
| 149 | |
/* ISO2022_ESCTHROUGHOUT: pass-through of escape sequences that are not
 * ISO-2022 designations.
 *
 * While F_ESCTHROUGHOUT is set, the current byte `c` is copied to the
 * output unchanged (assumed to be ISO-8859-1) and the enclosing loop is
 * restarted with `continue`.  The flag is dropped once a byte satisfying
 * IS_ESCEND has been copied.  Must be expanded inside a loop; expects
 * `state` in scope.  `c` is evaluated more than once. */
#define ISO2022_ESCTHROUGHOUT(c)                                        \
    if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) {                        \
        RESERVE_OUTBUF(1)                                               \
        OUT1(c)                                                         \
        NEXT(1, 1)                                                      \
        if (IS_ESCEND(c)) { /* sequence finished */                     \
            STATE_CLEARFLAG(state, F_ESCTHROUGHOUT)                     \
        }                                                               \
        continue;                                                       \
    }
| 161 | |
/* ISO2022_LOOP_BEGIN ... ISO2022_LOOP_END bracket the per-codec decoding
 * code.  Together they expand to
 *
 *     while (inleft > 0) {
 *         unsigned char c = IN1;
 *         <escape pass-through>
 *         switch (c) {
 *         <ESC, SO/SI, '\n' cases>
 *         default:
 *             if      (c >= 0x80) return 1;
 *             else if (c <  0x20) <copy the C0 control byte to the output>
 *             else {
 *                 ... codec-specific code placed between the macros ...
 *             }
 *         }
 *     }
 *
 * so the code between the two macros only ever sees bytes 0x20..0x7f that
 * are not handled by ISO2022_BASECASES, with the current byte in `c`. */
#define ISO2022_LOOP_BEGIN                                              \
    while (inleft > 0) {                                                \
        unsigned char c = IN1;                                          \
        ISO2022_ESCTHROUGHOUT(c)                                        \
        switch (c) {                                                    \
        ISO2022_BASECASES(c)                                            \
        default:                                                        \
            if (c >= 0x80)                                              \
                return 1;                                               \
            else if (c < 0x20) { /* C0 */                               \
                RESERVE_OUTBUF(1)                                       \
                OUT1(c)                                                 \
                NEXT(1, 1)                                              \
            } else {
#define ISO2022_LOOP_END                                                \
            }                                                           \
        }                                                               \
    }
| 180 | |
| 181 | static int |
| 182 | iso2022processesc(MultibyteCodec_State *state, |
| 183 | const unsigned char **inbuf, size_t *inleft) |
| 184 | { |
| 185 | unsigned char charset, designation; |
Hye-Shik Chang | d210a5b | 2004-01-23 14:36:17 +0000 | [diff] [blame^] | 186 | size_t i, esclen; |
Hye-Shik Chang | 3e2a306 | 2004-01-17 14:29:29 +0000 | [diff] [blame] | 187 | |
| 188 | for (i = 1;i < MAX_ESCSEQLEN;i++) { |
| 189 | if (i >= *inleft) |
| 190 | return MBERR_TOOFEW; |
| 191 | if (IS_ESCEND((*inbuf)[i])) { |
| 192 | esclen = i + 1; |
| 193 | break; |
| 194 | } |
| 195 | #ifdef ISO2022_USE_JISX0208EXT |
| 196 | else if (i+1 < *inleft && (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') |
| 197 | i += 2; |
| 198 | #endif |
| 199 | } |
| 200 | |
| 201 | if (i >= MAX_ESCSEQLEN) |
| 202 | return 1; /* unterminated escape sequence */ |
| 203 | |
| 204 | switch (esclen) { |
| 205 | case 3: |
| 206 | if (IN2 == '$') { |
| 207 | charset = IN3 | CHARSET_DOUBLEBYTE; |
| 208 | designation = 0; |
| 209 | } else { |
| 210 | charset = IN3; |
| 211 | if (IN2 == '(') designation = 0; |
| 212 | else if (IN2 == ')') designation = 1; |
| 213 | #ifdef ISO2022_USE_G2_DESIGNATION |
| 214 | else if (IN2 == '.') designation = 2; |
| 215 | #endif |
| 216 | else return 3; |
| 217 | } |
| 218 | break; |
| 219 | case 4: |
| 220 | if (IN2 != '$') |
| 221 | return 4; |
| 222 | |
| 223 | charset = IN4 | CHARSET_DOUBLEBYTE; |
| 224 | if (IN3 == '(') designation = 0; |
| 225 | else if (IN3 == ')') designation = 1; |
| 226 | else return 4; |
| 227 | break; |
| 228 | #ifdef ISO2022_USE_JISX0208EXT |
| 229 | case 6: /* designation with prefix */ |
| 230 | if ((*inbuf)[3] == ESC && (*inbuf)[4] == '$' && (*inbuf)[5] == 'B') { |
| 231 | charset = 'B' | CHARSET_DOUBLEBYTE; |
| 232 | designation = 0; |
| 233 | } else |
| 234 | return 6; |
| 235 | break; |
| 236 | #endif |
| 237 | default: |
| 238 | return esclen; |
| 239 | } |
| 240 | |
| 241 | { /* raise error when the charset is not designated for this encoding */ |
| 242 | const unsigned char dsgs[] = {ISO2022_DESIGNATIONS, '\x00'}; |
| 243 | |
| 244 | for (i = 0; dsgs[i] != '\x00'; i++) |
| 245 | if (dsgs[i] == charset) |
| 246 | break; |
| 247 | |
| 248 | if (dsgs[i] == '\x00') |
| 249 | return esclen; |
| 250 | } |
| 251 | |
| 252 | STATE_SETG(designation, state, charset) |
| 253 | *inleft -= esclen; |
| 254 | (*inbuf) += esclen; |
| 255 | return 0; |
| 256 | } |