Add CJK codecs support as discussed on python-dev. (SF #873597) Several style fixes are suggested by Martin v. Loewis and Marc-Andre Lemburg. Thanks!

commit: 3e2a30692085d32ac63f72b35da39158a471fc68 [log] [tgz]
author: Hye-Shik Chang <hyeshik@gmail.com> Sat Jan 17 14:29:29 2004 +0000
committer: Hye-Shik Chang <hyeshik@gmail.com> Sat Jan 17 14:29:29 2004 +0000
tree: 4cbe735f61eae87ac56a13ca6bd32113b98bd03d
parent: cd1f7430cb8f48de970021071d7683054c23b10f [diff] [blame]
diff --git a/Modules/cjkcodecs/iso2022common.h b/Modules/cjkcodecs/iso2022common.h
new file mode 100644
index 0000000..cb8a159
--- /dev/null
+++ b/Modules/cjkcodecs/iso2022common.h

@@ -0,0 +1,256 @@
+/*
+ * iso2022common.h: Common Codec Routines for ISO-2022 codecs.
+ *
+ * Written by Hye-Shik Chang <perky@FreeBSD.org>
+ * $CJKCodecs: iso2022common.h,v 1.8 2003/12/31 05:46:55 perky Exp $
+ */
+
+/* This ISO-2022 implementation is intended to comply with ECMA-43 Level 1
+ * rather than with the RFCs themselves. */
+
+#define ESC     0x1b    /* starts an escape sequence (see ISO2022_BASECASES) */
+#define SO      0x0e    /* shift out: sets F_SHIFTED (see SHIFT_CASES) */
+#define SI      0x0f    /* shift in: clears F_SHIFTED (see SHIFT_CASES) */
+
+#define MAX_ESCSEQLEN       16  /* scan limit used by iso2022processesc() */
+/* IS_ESCEND(c): true when c is '@' or 'A'..'Z'; such a byte ends an
+ * escape sequence in iso2022processesc() and ISO2022_ESCTHROUGHOUT.
+ * IS_ISO2022ESC(c2): true when c2, the byte right after ESC, is one of
+ * the sequence headers that are handed to iso2022processesc(). */
+#define IS_ESCEND(c)        (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
+#define IS_ISO2022ESC(c2)   ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
+                             (c2) == '.' || (c2) == '&')
+        /* This is not a full list of ISO-2022 escape sequence headers,
+         * but it is enough to implement the CJK instances of ISO-2022. */
+
+/* STATE
+
+  state->c[0-3]
+
+        00000000
+        ||^^^^^|
+        |+-----+----  G0-3 Character Set
+        +-----------  Is G0-3 double byte?
+
+  state->c[4]
+
+        00000000
+              ||
+              |+----  Locked-Shift?
+              +-----  ESC Throughout
+*/
+
+#define CHARSET_DOUBLEBYTE  0x80    /* flag bit marking a double-byte charset id */
+/* Charset ids, as stored in state->c[0..3].  An id is the final byte of
+ * the escape sequence that designates the charset, with
+ * CHARSET_DOUBLEBYTE or'ed in when the sequence carries the '$' header
+ * (see iso2022processesc()). */
+#define CHARSET_ASCII       'B'
+
+#define CHARSET_ISO8859_1   'A'
+#define CHARSET_ISO8859_7   'F'
+
+#define CHARSET_KSX1001     ('C'|CHARSET_DOUBLEBYTE)
+
+#define CHARSET_JISX0201_R  'J'
+#define CHARSET_JISX0201_K  'I'
+#define CHARSET_JISX0208    ('B'|CHARSET_DOUBLEBYTE)
+#define CHARSET_JISX0208_O  ('@'|CHARSET_DOUBLEBYTE)
+#define CHARSET_JISX0212    ('D'|CHARSET_DOUBLEBYTE)
+#define CHARSET_JISX0213_1  ('O'|CHARSET_DOUBLEBYTE)
+#define CHARSET_JISX0213_2  ('P'|CHARSET_DOUBLEBYTE)
+
+#define CHARSET_GB2312      ('A'|CHARSET_DOUBLEBYTE)
+#define CHARSET_GB2312_8565 ('E'|CHARSET_DOUBLEBYTE)
+/* CHARSET_DESIGN(c): the id with the double-byte flag masked off.
+ * CHARSET_ISDBCS(c): nonzero when the id has the double-byte flag set. */
+#define CHARSET_DESIGN(c)   ((c) & 0x7f)
+#define CHARSET_ISDBCS(c)   ((c) & 0x80)
+
+#define F_SHIFTED           0x01    /* state->c[4]: locking shift in effect */
+#define F_ESCTHROUGHOUT     0x02    /* state->c[4]: copying a foreign escape sequence */
+/* Accessors for the charset ids designated to G0..G3, held in
+ * state->c[0..3].  The setters expand to a complete statement, trailing
+ * ';' included, so call sites do not add a semicolon of their own. */
+#define STATE_SETG(dn, s, v)    ((s)->c[dn]) = (v);
+#define STATE_GETG(dn, s)       ((s)->c[dn])
+
+#define STATE_SETG0(s, v)   STATE_SETG(0, s, v)
+#define STATE_GETG0(s)      STATE_GETG(0, s)
+#define STATE_SETG1(s, v)   STATE_SETG(1, s, v)
+#define STATE_GETG1(s)      STATE_GETG(1, s)
+#define STATE_SETG2(s, v)   STATE_SETG(2, s, v)
+#define STATE_GETG2(s)      STATE_GETG(2, s)
+#define STATE_SETG3(s, v)   STATE_SETG(3, s, v)
+#define STATE_GETG3(s)      STATE_GETG(3, s)
+/* Flag helpers for (s)->c[4]; f is F_SHIFTED and/or F_ESCTHROUGHOUT.
+ * SETFLAG, CLEARFLAG and CLEARFLAGS also carry their own ';'. */
+#define STATE_SETFLAG(s, f)     ((s)->c[4]) |= (f);
+#define STATE_GETFLAG(s, f)     ((s)->c[4] & (f))
+#define STATE_CLEARFLAG(s, f)   ((s)->c[4]) &= ~(f);
+#define STATE_CLEARFLAGS(s)     ((s)->c[4]) = 0;
+
+/* ISO2022_GETCHARSET(charset, c1): store in `charset` the charset id that
+ * applies to input byte c1 -- the G1 set while F_SHIFTED is set, the G0
+ * set otherwise.  Makes the enclosing function return 1 when c1 is 0x80
+ * or above.  Expects a variable named `state` in scope.  The expansion is
+ * a complete if/else statement, so call sites add no trailing semicolon. */
+#define ISO2022_GETCHARSET(charset, c1)                             \
+    if ((c1) >= 0x80)                                               \
+        return 1;                                                   \
+    if (STATE_GETFLAG(state, F_SHIFTED)) /* G1 */                   \
+        (charset) = STATE_GETG1(state);                             \
+    else /* G0 */                                                   \
+        (charset) = STATE_GETG0(state);
+
+#ifdef ISO2022_USE_G2_DESIGNATION
+/* SS2_ROUTINE: handles "ESC N <byte>" (single shift 2) by decoding <byte>
+ * with the charset currently designated to G2.  It is spliced between
+ * "else" and "{" in ISO2022_BASECASES, which is why the expansion ends in
+ * a dangling "else"; without ISO2022_USE_G2_DESIGNATION it expands to
+ * nothing.
+ *
+ * Hardcoded for iso-2022-jp-2 for now.  We'll need to generalize it
+ * when we have more G2-designating encodings. */
+#define SS2_ROUTINE                                                 \
+    if (IN2 == 'N') { /* SS2 */                                     \
+        RESERVE_INBUF(3) /* ESC, 'N' and the data byte are needed */ \
+        if (STATE_GETG2(state) == CHARSET_ISO8859_1) {              \
+            /* the data byte is passed on with bit 7 flipped */     \
+            ISO8859_1_DECODE(IN3 ^ 0x80, **outbuf)                  \
+            else return 3; /* pairs with an `if` inside the DECODE macro (defined elsewhere) */ \
+        } else if (STATE_GETG2(state) == CHARSET_ISO8859_7) {       \
+            ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)                  \
+            else return 3;                                          \
+        } else if (STATE_GETG2(state) == CHARSET_ASCII) {           \
+            if (IN3 & 0x80) return 3; /* not a 7-bit byte */        \
+            else **outbuf = IN3;                                    \
+        } else /* G2 holds a charset this routine does not handle */ \
+            return MBERR_INTERNAL;                                  \
+        NEXT(3, 1) /* presumably 3 input bytes in, 1 output unit out; NEXT is defined elsewhere */ \
+    } else
+#else
+#define SS2_ROUTINE
+#endif
+
+#ifndef ISO2022_NO_SHIFT
+#define SHIFT_CASES /* locking-shift controls, spliced into ISO2022_BASECASES */ \
+    case SI: /* shift in: ISO2022_GETCHARSET goes back to G0 */     \
+        STATE_CLEARFLAG(state, F_SHIFTED)                           \
+        NEXT_IN(1) /* presumably skips the control byte; NEXT_IN is defined elsewhere */ \
+        break;                                                      \
+    case SO: /* shift out: ISO2022_GETCHARSET switches to G1 */     \
+        STATE_SETFLAG(state, F_SHIFTED)                             \
+        NEXT_IN(1)                                                  \
+        break;
+#else
+/* For compatibility with JapaneseCodecs: no SI/SO cases are generated,
+ * so in ISO2022_LOOP_BEGIN those bytes reach the default (C0) branch. */
+#define SHIFT_CASES
+#endif
+
+/* ISO2022_BASECASES(c1): the switch cases common to the ISO-2022 decoders.
+ *   ESC   - a sequence whose second byte satisfies IS_ISO2022ESC is parsed
+ *           by iso2022processesc(), and a non-zero result is returned
+ *           from the enclosing function; SS2_ROUTINE (when enabled) takes
+ *           "ESC N"; any other sequence is copied through, starting with
+ *           the ESC itself and continuing via ISO2022_ESCTHROUGHOUT.
+ *   SI/SO - see SHIFT_CASES.
+ *   '\n'  - copied to the output; also cancels a locking shift.
+ * Expects `state`, `inbuf` and `inleft` of the enclosing decoder in
+ * scope.  The c1 argument is not used by the expansion. */
+#define ISO2022_BASECASES(c1)                                       \
+    case ESC:                                                       \
+        RESERVE_INBUF(2)                                            \
+        if (IS_ISO2022ESC(IN2)) {                                   \
+            int err;                                                \
+            err = iso2022processesc(state, inbuf, &inleft);         \
+            if (err != 0)                                           \
+                return err;                                         \
+        } else SS2_ROUTINE {                                        \
+            /* Reserve output room before writing, as every other   \
+             * OUT1 site in this file does, and only then enter     \
+             * pass-through mode. */                                \
+            RESERVE_OUTBUF(1)                                       \
+            OUT1(ESC)                                               \
+            STATE_SETFLAG(state, F_ESCTHROUGHOUT)                   \
+            NEXT(1, 1)                                              \
+        }                                                           \
+        break;                                                      \
+    SHIFT_CASES                                                     \
+    case '\n':                                                      \
+        STATE_CLEARFLAG(state, F_SHIFTED)                           \
+        WRITE1('\n')                                                \
+        NEXT(1, 1)                                                  \
+        break;
+
+#define ISO2022_ESCTHROUGHOUT(c) /* pass-through step; needs `state` and an enclosing loop */ \
+    if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) {                    \
+        /* ESC throughout mode: for non-iso2022 escape sequences */ \
+        RESERVE_OUTBUF(1)                                           \
+        OUT1(c) /* assume as ISO-8859-1 */                          \
+        NEXT(1, 1)                                                  \
+        if (IS_ESCEND(c)) { /* final byte: the sequence is over */  \
+            STATE_CLEARFLAG(state, F_ESCTHROUGHOUT)                 \
+        }                                                           \
+        continue; /* on to the next byte of the ISO2022_LOOP_BEGIN loop */ \
+    }
+
+#define ISO2022_LOOP_BEGIN /* opens the decode loop; must be closed by ISO2022_LOOP_END */ \
+    while (inleft > 0) {                                            \
+        unsigned char c = IN1;                                      \
+        ISO2022_ESCTHROUGHOUT(c) /* may `continue` with the next byte */ \
+        switch(c) {                                                 \
+        ISO2022_BASECASES(c) /* ESC, SI/SO (if enabled) and '\n' */ \
+        default:                                                    \
+            if (c < 0x20) { /* C0 */                                \
+                RESERVE_OUTBUF(1)                                   \
+                OUT1(c) /* other control bytes are copied as-is */  \
+                NEXT(1, 1)                                          \
+            } else if (c >= 0x80) /* 8-bit byte: rejected */        \
+                return 1;                                           \
+            else { /* 0x20..0x7f: the code between BEGIN and END runs here */
+#define ISO2022_LOOP_END /* closes the else, the switch and the while of ISO2022_LOOP_BEGIN */ \
+            }                                                       \
+        }                                                           \
+    }
+
+/*
+ * iso2022processesc: parse the escape sequence at *inbuf and record the
+ * charset it designates in *state.
+ *
+ * (*inbuf)[0] is the ESC itself; the caller (ISO2022_BASECASES) has
+ * already checked (*inbuf)[1] with IS_ISO2022ESC.
+ *
+ * Accepted forms (F is the final byte, '@' or 'A'..'Z'):
+ *   ESC ( F   ESC ) F   ESC . F    single-byte set to G0 / G1 / G2
+ *                                  ('.' only with ISO2022_USE_G2_DESIGNATION)
+ *   ESC $ F                        double-byte set to G0
+ *   ESC $ ( F   ESC $ ) F          double-byte set to G0 / G1
+ *   ESC & @ ESC $ B                CHARSET_JISX0208 to G0, with prefix
+ *                                  (only with ISO2022_USE_JISX0208EXT)
+ *
+ * Returns 0 on success, after advancing *inbuf and reducing *inleft by
+ * the length of the sequence.  On failure nothing is consumed and the
+ * result is MBERR_TOOFEW when the input ends before a final byte is
+ * found, 1 when no final byte shows up within MAX_ESCSEQLEN bytes, or
+ * the length of the sequence when it is malformed or designates a
+ * charset that is not listed in ISO2022_DESIGNATIONS.
+ */
+static int
+iso2022processesc(MultibyteCodec_State *state,
+                  const unsigned char **inbuf, size_t *inleft)
+{
+    unsigned char charset, designation;
+    int  i, esclen = 0;
+
+    /* look for the final byte that terminates the sequence */
+    for (i = 1;i < MAX_ESCSEQLEN;i++) {
+        if ((size_t)i >= *inleft)
+            return MBERR_TOOFEW;
+        if (IS_ESCEND((*inbuf)[i])) {
+            esclen = i + 1;
+            break;
+        }
+#ifdef ISO2022_USE_JISX0208EXT
+        /* "& @" is a prefix, not an end: skip over the '@' so that it
+         * is not taken for a final byte */
+        else if ((size_t)i + 1 < *inleft &&
+                 (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@')
+            i += 2;
+#endif
+    }
+
+    if (i >= MAX_ESCSEQLEN)
+        return 1; /* unterminated escape sequence */
+
+    switch (esclen) {
+    case 3: /* ESC <header> F */
+        if (IN2 == '$') {
+            charset = IN3 | CHARSET_DOUBLEBYTE;
+            designation = 0;
+        } else {
+            charset = IN3;
+            if (IN2 == '(') designation = 0;
+            else if (IN2 == ')') designation = 1;
+#ifdef ISO2022_USE_G2_DESIGNATION
+            else if (IN2 == '.') designation = 2;
+#endif
+            else return 3;
+        }
+        break;
+    case 4: /* ESC $ <header> F */
+        if (IN2 != '$')
+            return 4;
+
+        charset = IN4 | CHARSET_DOUBLEBYTE;
+        if (IN3 == '(') designation = 0;
+        else if (IN3 == ')') designation = 1;
+        else return 4;
+        break;
+#ifdef ISO2022_USE_JISX0208EXT
+    case 6: /* designation with prefix: ESC & @ ESC $ B */
+        /* The scan above also reaches length 6 for two arbitrary
+         * non-final bytes followed by "ESC $ B", so the "& @" prefix
+         * has to be verified here as well. */
+        if (IN2 == '&' && IN3 == '@' &&
+            (*inbuf)[3] == ESC && (*inbuf)[4] == '$' && (*inbuf)[5] == 'B') {
+            charset = 'B' | CHARSET_DOUBLEBYTE;
+            designation = 0;
+        } else
+            return 6;
+        break;
+#endif
+    default:
+        return esclen;
+    }
+
+    { /* raise error when the charset is not designated for this encoding */
+        const unsigned char dsgs[] = {ISO2022_DESIGNATIONS, '\x00'};
+
+        for (i = 0; dsgs[i] != '\x00'; i++)
+            if (dsgs[i] == charset)
+                break;
+
+        if (dsgs[i] == '\x00')
+            return esclen;
+    }
+
+    /* STATE_SETG supplies its own ';' */
+    STATE_SETG(designation, state, charset)
+    *inleft -= esclen;
+    (*inbuf) += esclen;
+    return 0;
+}
commit	3e2a30692085d32ac63f72b35da39158a471fc68	[log] [tgz]
author	Hye-Shik Chang <hyeshik@gmail.com>	Sat Jan 17 14:29:29 2004 +0000
committer	Hye-Shik Chang <hyeshik@gmail.com>	Sat Jan 17 14:29:29 2004 +0000
tree	4cbe735f61eae87ac56a13ca6bd32113b98bd03d
parent	cd1f7430cb8f48de970021071d7683054c23b10f [diff] [blame]