| /* |
| * iso2022common.h: Common Codec Routines for ISO-2022 codecs. |
| * |
| * Written by Hye-Shik Chang <perky@FreeBSD.org> |
| * $CJKCodecs: iso2022common.h,v 1.8 2003/12/31 05:46:55 perky Exp $ |
| */ |
| |
| /* This ISO-2022 implementation is intended to comply ECMA-43 Level 1 |
| * rather than RFCs itself */ |
| |
| #define ESC 0x1b |
| #define SO 0x0e |
| #define SI 0x0f |
| |
| #define MAX_ESCSEQLEN 16 |
| |
| #define IS_ESCEND(c) (((c) >= 'A' && (c) <= 'Z') || (c) == '@') |
| #define IS_ISO2022ESC(c2) ((c2) == '(' || (c2) == ')' || (c2) == '$' || \ |
| (c2) == '.' || (c2) == '&') |
| /* this is not a full list of ISO-2022 escape sequence headers. |
| * but, it's enough to implement CJK instances of iso-2022. */ |
| |
| /* STATE |
| |
| state->c[0-3] |
| |
| 00000000 |
| ||^^^^^| |
| |+-----+---- G0-3 Character Set |
| +----------- Is G0-3 double byte? |
| |
| state->c[4] |
| |
| 00000000 |
| || |
| |+---- Locked-Shift? |
| +----- ESC Throughout |
| */ |
| |
| #define CHARSET_DOUBLEBYTE 0x80 |
| |
| #define CHARSET_ASCII 'B' |
| |
| #define CHARSET_ISO8859_1 'A' |
| #define CHARSET_ISO8859_7 'F' |
| |
| #define CHARSET_KSX1001 ('C'|CHARSET_DOUBLEBYTE) |
| |
| #define CHARSET_JISX0201_R 'J' |
| #define CHARSET_JISX0201_K 'I' |
| #define CHARSET_JISX0208 ('B'|CHARSET_DOUBLEBYTE) |
| #define CHARSET_JISX0208_O ('@'|CHARSET_DOUBLEBYTE) |
| #define CHARSET_JISX0212 ('D'|CHARSET_DOUBLEBYTE) |
| #define CHARSET_JISX0213_1 ('O'|CHARSET_DOUBLEBYTE) |
| #define CHARSET_JISX0213_2 ('P'|CHARSET_DOUBLEBYTE) |
| |
| #define CHARSET_GB2312 ('A'|CHARSET_DOUBLEBYTE) |
| #define CHARSET_GB2312_8565 ('E'|CHARSET_DOUBLEBYTE) |
| |
| #define CHARSET_DESIGN(c) ((c) & 0x7f) |
| #define CHARSET_ISDBCS(c) ((c) & 0x80) |
| |
| #define F_SHIFTED 0x01 |
| #define F_ESCTHROUGHOUT 0x02 |
| |
| #define STATE_SETG(dn, s, v) ((s)->c[dn]) = (v); |
| #define STATE_GETG(dn, s) ((s)->c[dn]) |
| |
| #define STATE_SETG0(s, v) STATE_SETG(0, s, v) |
| #define STATE_GETG0(s) STATE_GETG(0, s) |
| #define STATE_SETG1(s, v) STATE_SETG(1, s, v) |
| #define STATE_GETG1(s) STATE_GETG(1, s) |
| #define STATE_SETG2(s, v) STATE_SETG(2, s, v) |
| #define STATE_GETG2(s) STATE_GETG(2, s) |
| #define STATE_SETG3(s, v) STATE_SETG(3, s, v) |
| #define STATE_GETG3(s) STATE_GETG(3, s) |
| |
| #define STATE_SETFLAG(s, f) ((s)->c[4]) |= (f); |
| #define STATE_GETFLAG(s, f) ((s)->c[4] & (f)) |
| #define STATE_CLEARFLAG(s, f) ((s)->c[4]) &= ~(f); |
| #define STATE_CLEARFLAGS(s) ((s)->c[4]) = 0; |
| |
| #define ISO2022_GETCHARSET(charset, c1) \ |
| if ((c) >= 0x80) \ |
| return 1; \ |
| if (STATE_GETFLAG(state, F_SHIFTED)) /* G1 */ \ |
| (charset) = STATE_GETG1(state); \ |
| else /* G1 */ \ |
| (charset) = STATE_GETG0(state); \ |
| |
| #ifdef ISO2022_USE_G2_DESIGNATION |
| /* hardcoded for iso-2022-jp-2 for now. we'll need to generalize it |
| when we have more G2 designating encodings */ |
| #define SS2_ROUTINE \ |
| if (IN2 == 'N') { /* SS2 */ \ |
| RESERVE_INBUF(3) \ |
| if (STATE_GETG2(state) == CHARSET_ISO8859_1) { \ |
| ISO8859_1_DECODE(IN3 ^ 0x80, **outbuf) \ |
| else return 3; \ |
| } else if (STATE_GETG2(state) == CHARSET_ISO8859_7) { \ |
| ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf) \ |
| else return 3; \ |
| } else if (STATE_GETG2(state) == CHARSET_ASCII) { \ |
| if (IN3 & 0x80) return 3; \ |
| else **outbuf = IN3; \ |
| } else \ |
| return MBERR_INTERNAL; \ |
| NEXT(3, 1) \ |
| } else |
| #else |
| #define SS2_ROUTINE |
| #endif |
| |
| #ifndef ISO2022_NO_SHIFT |
| #define SHIFT_CASES \ |
| case SI: \ |
| STATE_CLEARFLAG(state, F_SHIFTED) \ |
| NEXT_IN(1) \ |
| break; \ |
| case SO: \ |
| STATE_SETFLAG(state, F_SHIFTED) \ |
| NEXT_IN(1) \ |
| break; |
| #else |
| /* for compatibility with JapaneseCodecs */ |
| #define SHIFT_CASES |
| #endif |
| |
| #define ISO2022_BASECASES(c1) \ |
| case ESC: \ |
| RESERVE_INBUF(2) \ |
| if (IS_ISO2022ESC(IN2)) { \ |
| int err; \ |
| err = iso2022processesc(state, inbuf, &inleft); \ |
| if (err != 0) \ |
| return err; \ |
| } else SS2_ROUTINE { \ |
| STATE_SETFLAG(state, F_ESCTHROUGHOUT) \ |
| OUT1(ESC) \ |
| NEXT(1, 1) \ |
| } \ |
| break; \ |
| SHIFT_CASES \ |
| case '\n': \ |
| STATE_CLEARFLAG(state, F_SHIFTED) \ |
| WRITE1('\n') \ |
| NEXT(1, 1) \ |
| break; |
| |
| #define ISO2022_ESCTHROUGHOUT(c) \ |
| if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) { \ |
| /* ESC throughout mode: for non-iso2022 escape sequences */ \ |
| RESERVE_OUTBUF(1) \ |
| OUT1(c) /* assume as ISO-8859-1 */ \ |
| NEXT(1, 1) \ |
| if (IS_ESCEND(c)) { \ |
| STATE_CLEARFLAG(state, F_ESCTHROUGHOUT) \ |
| } \ |
| continue; \ |
| } |
| |
| #define ISO2022_LOOP_BEGIN \ |
| while (inleft > 0) { \ |
| unsigned char c = IN1; \ |
| ISO2022_ESCTHROUGHOUT(c) \ |
| switch(c) { \ |
| ISO2022_BASECASES(c) \ |
| default: \ |
| if (c < 0x20) { /* C0 */ \ |
| RESERVE_OUTBUF(1) \ |
| OUT1(c) \ |
| NEXT(1, 1) \ |
| } else if (c >= 0x80) \ |
| return 1; \ |
| else { |
| #define ISO2022_LOOP_END \ |
| } \ |
| } \ |
| } |
| |
| static int |
| iso2022processesc(MultibyteCodec_State *state, |
| const unsigned char **inbuf, size_t *inleft) |
| { |
| unsigned char charset, designation; |
| size_t i, esclen; |
| |
| for (i = 1;i < MAX_ESCSEQLEN;i++) { |
| if (i >= *inleft) |
| return MBERR_TOOFEW; |
| if (IS_ESCEND((*inbuf)[i])) { |
| esclen = i + 1; |
| break; |
| } |
| #ifdef ISO2022_USE_JISX0208EXT |
| else if (i+1 < *inleft && (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') |
| i += 2; |
| #endif |
| } |
| |
| if (i >= MAX_ESCSEQLEN) |
| return 1; /* unterminated escape sequence */ |
| |
| switch (esclen) { |
| case 3: |
| if (IN2 == '$') { |
| charset = IN3 | CHARSET_DOUBLEBYTE; |
| designation = 0; |
| } else { |
| charset = IN3; |
| if (IN2 == '(') designation = 0; |
| else if (IN2 == ')') designation = 1; |
| #ifdef ISO2022_USE_G2_DESIGNATION |
| else if (IN2 == '.') designation = 2; |
| #endif |
| else return 3; |
| } |
| break; |
| case 4: |
| if (IN2 != '$') |
| return 4; |
| |
| charset = IN4 | CHARSET_DOUBLEBYTE; |
| if (IN3 == '(') designation = 0; |
| else if (IN3 == ')') designation = 1; |
| else return 4; |
| break; |
| #ifdef ISO2022_USE_JISX0208EXT |
| case 6: /* designation with prefix */ |
| if ((*inbuf)[3] == ESC && (*inbuf)[4] == '$' && (*inbuf)[5] == 'B') { |
| charset = 'B' | CHARSET_DOUBLEBYTE; |
| designation = 0; |
| } else |
| return 6; |
| break; |
| #endif |
| default: |
| return esclen; |
| } |
| |
| { /* raise error when the charset is not designated for this encoding */ |
| const unsigned char dsgs[] = {ISO2022_DESIGNATIONS, '\x00'}; |
| |
| for (i = 0; dsgs[i] != '\x00'; i++) |
| if (dsgs[i] == charset) |
| break; |
| |
| if (dsgs[i] == '\x00') |
| return esclen; |
| } |
| |
| STATE_SETG(designation, state, charset) |
| *inleft -= esclen; |
| (*inbuf) += esclen; |
| return 0; |
| } |