| /** |
| *** Build a deterministic finite automaton to associate CCSIDs with |
| *** character set names. |
| *** |
| *** Compile on OS/400 with options SYSIFCOPT(*IFSIO). |
| *** |
| *** See Copyright for the status of this software. |
| *** |
| *** Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A. |
| **/ |
| |
#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <ctype.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>

#include <iconv.h>
| |
| |
| #ifdef OLDXML |
| #include "xml.h" |
| #else |
| #include <libxml/hash.h> |
| #include <libxml/parser.h> |
| #include <libxml/xpath.h> |
| #include <libxml/xpathInternals.h> |
| #endif |
| |
| |
/**
***	Test for / force the "open failed" state of an iconv descriptor.
***
***	When __OS400__ is defined the descriptor is accessed through its
***	`return_value' member; otherwise it is compared with, or set to,
***	(iconv_t) -1.
**/

#ifndef __OS400__
#define iconv_open_error(cd)		((cd) == (iconv_t) -1)
#define set_iconv_open_error(cd)	((cd) = (iconv_t) -1)
#else
#define iconv_open_error(cd)		((cd).return_value == -1)
#define set_iconv_open_error(cd)	((cd).return_value = -1)
#endif
| |
| |
| #define C_SOURCE_CCSID 500 /* NOTE(review): presumably the CCSID of the C source; not referenced in the visible code. */ |
| #define C_UTF8_CCSID 1208 /* CCSID passed as UTF-8 side when opening converters. */ |
| |
| |
| #define UTF8_SPACE 0x20 /* Space. */ |
| #define UTF8_HT 0x09 /* Horizontal tab. */ |
| #define UTF8_0 0x30 /* Digit '0'. */ |
| #define UTF8_9 0x39 /* Digit '9'. */ |
| #define UTF8_A 0x41 /* Uppercase 'A'. */ |
| #define UTF8_Z 0x5A /* Uppercase 'Z'. */ |
| #define UTF8_a 0x61 /* Lowercase 'a'. */ |
| #define UTF8_z 0x7A /* Lowercase 'z'. */ |
| |
| |
| #define GRANULE 128 /* Powerset allocation granule, in elements. */ |
| |
| #define EPSILON 0x100 /* Token for empty transition: above any byte value. */ |
| |
| |
/**
***	OFFSETOF(t, f): byte offset of member `f' within type `t', as an
***		unsigned int. Implemented with the standard offsetof()
***		rather than arithmetic on a null pointer.
***	OFFSETBY(t, p, o): pointer to a `t' located `o' bytes after `p'.
**/

#ifndef OFFSETOF
#define OFFSETOF(t, f)	((unsigned int) offsetof(t, f))
#endif

#ifndef OFFSETBY
#define OFFSETBY(t, p, o)	((t *) ((char *) (p) + (unsigned int) (o)))
#endif
| |
| |
| typedef struct t_transition t_transition; /* NFA/DFA transition (token-labelled edge). */ |
| typedef struct t_state t_state; /* NFA/DFA state node. */ |
| typedef struct t_symlist t_symlist; /* Linked list of symbols (i.e.: names). */ |
| typedef struct t_chset t_chset; /* Character set descriptor. */ |
| typedef struct t_stategroup t_stategroup; /* State group used by DFA optimization. */ |
| typedef unsigned char utf8char; /* UTF-8 character byte. */ |
| typedef unsigned char byte; /* Untyped data byte. */ |
| |
| |
| typedef struct { /* Set of pointers, grown by GRANULE elements. */ |
| unsigned int p_size; /* Allocated capacity; slot p_set[p_size] holds a null. */ |
| unsigned int p_card; /* Current element count. */ |
| void * p_set[1]; /* Elements; set_include() inserts at set_position(). */ |
| } t_powerset; |
| |
| |
| struct t_transition { |
| t_transition * t_forwprev; /* Previous transition in t_from's forward list. */ |
| t_transition * t_forwnext; /* Next transition in t_from's forward list. */ |
| t_transition * t_backprev; /* Previous transition in t_to's backward list. */ |
| t_transition * t_backnext; /* Next transition in t_to's backward list. */ |
| t_state * t_from; /* Origin state. */ |
| t_state * t_to; /* Destination state. */ |
| unsigned short t_token; /* Transition token: a byte value or EPSILON. */ |
| unsigned int t_index; /* Transition array index. */ |
| }; |
| |
| |
| struct t_state { |
| t_state * s_next; /* Next state in the dfa_states list. */ |
| t_state * s_stack; /* Link in a work stack of states still to process. */ |
| t_transition * s_forward; /* Head of the list of outgoing transitions. */ |
| t_transition * s_backward; /* Head of the list of incoming transitions. */ |
| t_chset * s_final; /* Recognized character set; null if not a final state. */ |
| t_powerset * s_nfastates; /* DFA state only: corresponding NFA states. */ |
| unsigned int s_index; /* State index. */ |
| }; |
| |
| |
| struct t_symlist { |
| t_symlist * l_next; /* Next name in list. */ |
| utf8char l_symbol[1]; /* Name bytes, null-terminated; allocated past the struct. */ |
| }; |
| |
| |
| struct t_chset { |
| t_chset * c_next; /* Next character set in chset_list. */ |
| t_symlist * c_names; /* Names already entered for this character set. */ |
| iconv_t c_fromUTF8; /* Conversion from UTF-8, or the open-error value. */ |
| unsigned int c_ccsid; /* IBM character set code (0 if none given). */ |
| unsigned int c_mibenum; /* IANA character code (0 if none given). */ |
| }; |
| |
| |
| struct t_stategroup { |
| t_stategroup * g_next; /* Next group. */ |
| t_state * g_member; /* Group members, chained through s_stack. */ |
| unsigned int g_id; /* Group ident, taken from the groupid counter. */ |
| }; |
| |
| |
| |
| t_chset * chset_list; /* Character set list (newest first). */ |
| t_state * initial_state; /* Initial NFA state. */ |
| iconv_t job2utf8; /* Job CCSID to UTF-8 conversion. */ |
| iconv_t utf82job; /* UTF-8 to job CCSID conversion. */ |
| t_state * dfa_states; /* List of DFA states, linked through s_next. */ |
| unsigned int groupid; /* Group ident counter, advanced by newgroup(). */ |
| |
| |
| /** |
| *** UTF-8 strings. |
| *** |
| *** XML names, attribute names, XPath variable names and XPath |
| *** expressions handed to libxml2. |
| *** NOTE(review): the surrounding #pragma convert(819) / convert(0) pair |
| *** presumably makes the OS/400 compiler store these literals in |
| *** CCSID 819 (ASCII-compatible) instead of the source CCSID -- confirm |
| *** against the ILE C documentation. |
| *** The tablerows/headerpos/get* expressions exist in two variants: |
| *** without namespace prefix when OLDXML is defined, with the `html:' |
| *** prefix otherwise. |
| **/ |
| |
| #pragma convert(819) |
| |
| static const utf8char utf8_MIBenum[] = "MIBenum"; |
| static const utf8char utf8_mibenum[] = "mibenum"; |
| static const utf8char utf8_ibm_[] = "ibm-"; |
| static const utf8char utf8_IBMCCSID[] = "IBMCCSID"; |
| static const utf8char utf8_iana_[] = "iana-"; |
| static const utf8char utf8_Name[] = "Name"; |
| static const utf8char utf8_Pref_MIME_Name[] = "Preferred MIME Name"; |
| static const utf8char utf8_Aliases[] = "Aliases"; |
| static const utf8char utf8_html[] = "html"; |
| static const utf8char utf8_htmluri[] = "http://www.w3.org/1999/xhtml"; |
| static const utf8char utf8_A[] = "A"; |
| static const utf8char utf8_C[] = "C"; |
| static const utf8char utf8_M[] = "M"; |
| static const utf8char utf8_N[] = "N"; |
| static const utf8char utf8_P[] = "P"; |
| static const utf8char utf8_T[] = "T"; |
| static const utf8char utf8_ccsid[] = "ccsid"; |
| static const utf8char utf8_EBCDIC[] = "EBCDIC"; |
| static const utf8char utf8_ASCII[] = "ASCII"; |
| static const utf8char utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]"; |
| static const utf8char utf8_aliastext[] = |
| "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()"; |
| #ifdef OLDXML |
| static const utf8char utf8_tablerows[] = |
| "//table[@id='table-character-sets-1']/*/tr"; |
| static const utf8char utf8_headerpos[] = |
| "count(th[text()=$T]/preceding-sibling::th)+1"; |
| static const utf8char utf8_getmibenum[] = "number(td[$M])"; |
| static const utf8char utf8_getprefname[] = "string(td[$P])"; |
| static const utf8char utf8_getname[] = "string(td[$N])"; |
| static const utf8char utf8_getaliases[] = "td[$A]/text()"; |
| #else |
| static const utf8char utf8_tablerows[] = |
| "//html:table[@id='table-character-sets-1']/*/html:tr"; |
| static const utf8char utf8_headerpos[] = |
| "count(html:th[text()=$T]/preceding-sibling::html:th)+1"; |
| static const utf8char utf8_getmibenum[] = "number(html:td[$M])"; |
| static const utf8char utf8_getprefname[] = "string(html:td[$P])"; |
| static const utf8char utf8_getname[] = "string(html:td[$N])"; |
| static const utf8char utf8_getaliases[] = "html:td[$A]/text()"; |
| #endif |
| |
| #pragma convert(0) |
| |
| |
/**
***	UTF-8 character length table.
***
***	Index is first character byte, value is the character byte count.
***	A value of -1 marks a byte that cannot start a character
***	(continuation bytes 0x80-0xBF, and 0xFE/0xFF).
***	The table is read-only and therefore declared const.
**/

static const signed char	utf8_chlen[] = {
/* 00-07 */	1,	1,	1,	1,	1,	1,	1,	1,
/* 08-0F */	1,	1,	1,	1,	1,	1,	1,	1,
/* 10-17 */	1,	1,	1,	1,	1,	1,	1,	1,
/* 18-1F */	1,	1,	1,	1,	1,	1,	1,	1,
/* 20-27 */	1,	1,	1,	1,	1,	1,	1,	1,
/* 28-2F */	1,	1,	1,	1,	1,	1,	1,	1,
/* 30-37 */	1,	1,	1,	1,	1,	1,	1,	1,
/* 38-3F */	1,	1,	1,	1,	1,	1,	1,	1,
/* 40-47 */	1,	1,	1,	1,	1,	1,	1,	1,
/* 48-4F */	1,	1,	1,	1,	1,	1,	1,	1,
/* 50-57 */	1,	1,	1,	1,	1,	1,	1,	1,
/* 58-5F */	1,	1,	1,	1,	1,	1,	1,	1,
/* 60-67 */	1,	1,	1,	1,	1,	1,	1,	1,
/* 68-6F */	1,	1,	1,	1,	1,	1,	1,	1,
/* 70-77 */	1,	1,	1,	1,	1,	1,	1,	1,
/* 78-7F */	1,	1,	1,	1,	1,	1,	1,	1,
/* 80-87 */	-1,	-1,	-1,	-1,	-1,	-1,	-1,	-1,
/* 88-8F */	-1,	-1,	-1,	-1,	-1,	-1,	-1,	-1,
/* 90-97 */	-1,	-1,	-1,	-1,	-1,	-1,	-1,	-1,
/* 98-9F */	-1,	-1,	-1,	-1,	-1,	-1,	-1,	-1,
/* A0-A7 */	-1,	-1,	-1,	-1,	-1,	-1,	-1,	-1,
/* A8-AF */	-1,	-1,	-1,	-1,	-1,	-1,	-1,	-1,
/* B0-B7 */	-1,	-1,	-1,	-1,	-1,	-1,	-1,	-1,
/* B8-BF */	-1,	-1,	-1,	-1,	-1,	-1,	-1,	-1,
/* C0-C7 */	2,	2,	2,	2,	2,	2,	2,	2,
/* C8-CF */	2,	2,	2,	2,	2,	2,	2,	2,
/* D0-D7 */	2,	2,	2,	2,	2,	2,	2,	2,
/* D8-DF */	2,	2,	2,	2,	2,	2,	2,	2,
/* E0-E7 */	3,	3,	3,	3,	3,	3,	3,	3,
/* E8-EF */	3,	3,	3,	3,	3,	3,	3,	3,
/* F0-F7 */	4,	4,	4,	4,	4,	4,	4,	4,
/* F8-FF */	5,	5,	5,	5,	6,	6,	-1,	-1
};
| |
| |
| |
/**
***	Terminate the program with an out-of-memory message and exit
***	status 1 when `p' is a null pointer; return normally otherwise.
**/

void
chknull(void * p)

{
	if (!p) {
		fprintf(stderr, "Not enough memory\n");
		exit(1);
	}
}
| |
| |
/**
***	Build a conversion code string for `ccsid' into `buf'.
***
***	The first 32 bytes of `buf' are cleared, then the text
***	"IBMCCSID" + 5-digit CCSID + seven '0' characters is stored.
***	Only the low 16 bits of `ccsid' are used.
***	`buf' must provide at least 32 bytes.
**/

void
makecode(char * buf, unsigned int ccsid)

{
	const unsigned int	code = ccsid & 0xFFFF;
	unsigned int		k;

	for (k = 0; k < 32; k++)
		buf[k] = '\0';

	sprintf(buf, "IBMCCSID%05u0000000", code);
}
| |
| |
/**
***	Open an iconv conversion from CCSID `ccsidin' to CCSID `ccsidout'.
***
***	Both codes are built by makecode(). The target code is cut after
***	"IBMCCSID" and the 5 CCSID digits (everything from offset 13 is
***	zeroed). When `nullflag' is non-zero, the character at offset 18
***	of the source code is set to '1'.
***	NOTE(review): offset 18 is presumably the OS/400 option digit for
***	null-terminated input -- confirm against the platform iconv_open()
***	documentation.
***	Returns whatever iconv_open() returns.
**/

iconv_t
iconv_open_ccsid(unsigned int ccsidout,
		 unsigned int ccsidin, unsigned int nullflag)

{
	char	source[33];
	char	target[33];

	makecode(target, ccsidout);
	memset(&target[13], 0, sizeof target - 13);
	makecode(source, ccsidin);

	if (nullflag != 0)
		source[18] = '1';

	return iconv_open(target, source);
}
| |
| |
/**
***	Parse an unsigned decimal number at `*cpp'.
***
***	Consumes all leading decimal digits, stores the address of the first
***	unconsumed character back into `*cpp' and returns the value
***	(0 when there is no digit at all; overflow wraps).
**/

unsigned int
getnum(char * * cpp)

{
	const char *	cp = *cpp;
	unsigned int	n = 0;

	/* <ctype.h> classifiers require a value in unsigned char range. */
	while (isdigit((unsigned char) *cp))
		n = 10 * n + (unsigned int) (*cp++ - '0');

	*cpp = (char *) cp;
	return n;
}
| |
| |
| const utf8char * |
| hashBinaryKey(const byte * bytes, unsigned int len) |
| |
| { |
| const byte * bp; |
| utf8char * key; |
| utf8char * cp; |
| unsigned int n; |
| unsigned int n4; |
| unsigned int i; |
| |
| /** |
| *** Encode binary data in character form to be used as hash |
| *** table key. |
| **/ |
| |
| n = (4 * len + 2) / 3; |
| key = (utf8char *) malloc(n + 1); |
| chknull(key); |
| bp = bytes; |
| cp = key; |
| |
| for (n4 = n >> 2; n4; n4--) { |
| i = (bp[0] << 16) | (bp[1] << 8) | bp[2]; |
| *cp++ = 0x21 + ((i >> 18) & 0x3F); |
| *cp++ = 0x21 + ((i >> 12) & 0x3F); |
| *cp++ = 0x21 + ((i >> 6) & 0x3F); |
| *cp++ = 0x21 + (i & 0x3F); |
| bp += 3; |
| } |
| |
| switch (n & 0x3) { |
| |
| case 2: |
| *cp++ = 0x21 + ((*bp >> 2) & 0x3F); |
| *cp++ = 0x21 + ((*bp << 4) & 0x3F); |
| break; |
| |
| case 3: |
| i = (bp[0] << 8) | bp[1]; |
| *cp++ = 0x21 + ((i >> 10) & 0x3F); |
| *cp++ = 0x21 + ((i >> 4) & 0x3F); |
| *cp++ = 0x21 + ((i << 2) & 0x3F); |
| break; |
| } |
| |
| *cp = '\0'; |
| return key; |
| } |
| |
| |
| void * |
| hash_get(xmlHashTablePtr h, const void * binkey, unsigned int len) |
| |
| { |
| const utf8char * key; |
| void * result; |
| |
| key = hashBinaryKey((const byte *) binkey, len); |
| result = xmlHashLookup(h, key); |
| free((char *) key); |
| return result; |
| } |
| |
| |
| int |
| hash_add(xmlHashTablePtr h, const void * binkey, unsigned int len, void * data) |
| |
| { |
| const utf8char * key; |
| int result; |
| |
| key = hashBinaryKey((const byte *) binkey, len); |
| result = xmlHashAddEntry(h, key, data); |
| free((char *) key); |
| return result; |
| } |
| |
| |
| xmlDocPtr |
| loadXMLFile(const char * filename) |
| |
| { |
| struct stat sbuf; |
| byte * databuf; |
| int fd; |
| int i; |
| xmlDocPtr doc; |
| |
| if (stat(filename, &sbuf)) |
| return (xmlDocPtr) NULL; |
| |
| databuf = malloc(sbuf.st_size + 4); |
| |
| if (!databuf) |
| return (xmlDocPtr) NULL; |
| |
| fd = open(filename, O_RDONLY |
| #ifdef O_BINARY |
| | O_BINARY |
| #endif |
| ); |
| |
| if (fd < 0) { |
| free((char *) databuf); |
| return (xmlDocPtr) NULL; |
| } |
| |
| i = read(fd, (char *) databuf, sbuf.st_size); |
| close(fd); |
| |
| if (i != sbuf.st_size) { |
| free((char *) databuf); |
| return (xmlDocPtr) NULL; |
| } |
| |
| databuf[i] = databuf[i + 1] = databuf[i + 2] = databuf[i + 3] = 0; |
| doc = xmlParseMemory((xmlChar *) databuf, i); |
| free((char *) databuf); |
| return doc; |
| } |
| |
| |
/**
***	Case-insensitively match the keyword `s' at `*cpp'.
***
***	On success, `*cpp' is advanced past the keyword and any spaces or
***	tabs that follow it, and 1 is returned.
***	On mismatch, `*cpp' is left unchanged and 0 is returned.
**/

int
match(char * * cpp, char * s)

{
	const char *	cp = *cpp;
	int		c1;
	int		c2;

	/**
	***	Characters are fetched as unsigned char: <ctype.h> functions
	***	are undefined for negative values other than EOF.
	**/

	for (; (c2 = (unsigned char) *s) != 0; s++, cp++) {
		c1 = (unsigned char) *cp;

		if (c1 != c2 && tolower(c1) != tolower(c2))
			return 0;
	}

	while (*cp == ' ' || *cp == '\t')
		cp++;

	*cpp = (char *) cp;
	return 1;
}
| |
| |
| t_state * |
| newstate(void) |
| |
| { |
| t_state * s; |
| |
| s = (t_state *) malloc(sizeof *s); |
| chknull(s); |
| memset((char *) s, 0, sizeof *s); |
| return s; |
| } |
| |
| |
| void |
| unlink_transition(t_transition * t) |
| |
| { |
| if (t->t_backnext) |
| t->t_backnext->t_backprev = t->t_backprev; |
| |
| if (t->t_backprev) |
| t->t_backprev->t_backnext = t->t_backnext; |
| else if (t->t_to) |
| t->t_to->s_backward = t->t_backnext; |
| |
| if (t->t_forwnext) |
| t->t_forwnext->t_forwprev = t->t_forwprev; |
| |
| if (t->t_forwprev) |
| t->t_forwprev->t_forwnext = t->t_forwnext; |
| else if (t->t_from) |
| t->t_from->s_forward = t->t_forwnext; |
| |
| t->t_backprev = (t_transition *) NULL; |
| t->t_backnext = (t_transition *) NULL; |
| t->t_forwprev = (t_transition *) NULL; |
| t->t_forwnext = (t_transition *) NULL; |
| t->t_from = (t_state *) NULL; |
| t->t_to = (t_state *) NULL; |
| } |
| |
| |
| void |
| link_transition(t_transition * t, t_state * from, t_state * to) |
| |
| { |
| if (!from) |
| from = t->t_from; |
| |
| if (!to) |
| to = t->t_to; |
| |
| unlink_transition(t); |
| |
| if ((t->t_from = from)) { |
| if ((t->t_forwnext = from->s_forward)) |
| t->t_forwnext->t_forwprev = t; |
| |
| from->s_forward = t; |
| } |
| |
| if ((t->t_to = to)) { |
| if ((t->t_backnext = to->s_backward)) |
| t->t_backnext->t_backprev = t; |
| |
| to->s_backward = t; |
| } |
| } |
| |
| |
| t_transition * |
| newtransition(unsigned int token, t_state * from, t_state * to) |
| |
| { |
| t_transition * t; |
| |
| t = (t_transition *) malloc(sizeof *t); |
| chknull(t); |
| memset((char *) t, 0, sizeof *t); |
| t->t_token = token; |
| link_transition(t, from, to); |
| return t; |
| } |
| |
| |
| t_transition * |
| uniquetransition(unsigned int token, t_state * from, t_state * to) |
| |
| { |
| t_transition * t; |
| |
| for (t = from->s_forward; t; t = t->t_forwnext) |
| if (t->t_token == token && (t->t_to == to || !to)) |
| return t; |
| |
| return to? newtransition(token, from, to): (t_transition *) NULL; |
| } |
| |
| |
| int |
| set_position(t_powerset * s, void * e) |
| |
| { |
| unsigned int l; |
| unsigned int h; |
| unsigned int m; |
| int i; |
| |
| l = 0; |
| h = s->p_card; |
| |
| while (l < h) { |
| m = (l + h) >> 1; |
| |
| /** |
| *** If both pointers belong to different allocation arenas, |
| *** native comparison may find them neither |
| *** equal, nor greater, nor smaller. |
| *** We thus compare using memcmp() to get an orthogonal |
| *** result. |
| **/ |
| |
| i = memcmp(&e, s->p_set + m, sizeof e); |
| |
| if (i < 0) |
| h = m; |
| else if (!i) |
| return m; |
| else |
| l = m + 1; |
| } |
| |
| return l; |
| } |
| |
| |
| t_powerset * |
| set_include(t_powerset * s, void * e) |
| |
| { |
| unsigned int pos; |
| unsigned int n; |
| |
| if (!s) { |
| s = (t_powerset *) malloc(sizeof *s + |
| GRANULE * sizeof s->p_set); |
| chknull(s); |
| s->p_size = GRANULE; |
| s->p_set[GRANULE] = (t_state *) NULL; |
| s->p_set[0] = e; |
| s->p_card = 1; |
| return s; |
| } |
| |
| pos = set_position(s, e); |
| |
| if (pos < s->p_card && s->p_set[pos] == e) |
| return s; |
| |
| if (s->p_card >= s->p_size) { |
| s->p_size += GRANULE; |
| s = (t_powerset *) realloc(s, |
| sizeof *s + s->p_size * sizeof s->p_set); |
| chknull(s); |
| s->p_set[s->p_size] = (t_state *) NULL; |
| } |
| |
| n = s->p_card - pos; |
| |
| if (n) |
| memmove((char *) (s->p_set + pos + 1), |
| (char *) (s->p_set + pos), n * sizeof s->p_set[0]); |
| |
| s->p_set[pos] = e; |
| s->p_card++; |
| return s; |
| } |
| |
| |
| t_state * |
| nfatransition(t_state * to, byte token) |
| |
| { |
| t_state * from; |
| |
| from = newstate(); |
| newtransition(token, from, to); |
| return from; |
| } |
| |
| |
| static t_state * nfadevelop(t_state * from, t_state * final, iconv_t icc, |
| const utf8char * name, unsigned int len); |
| |
| |
| void |
| nfaslice(t_state * * from, t_state * * to, iconv_t icc, |
| const utf8char * chr, unsigned int chlen, |
| const utf8char * name, unsigned int len, t_state * final) |
| |
| { |
| char * srcp; |
| char * dstp; |
| size_t srcc; |
| size_t dstc; |
| unsigned int cnt; |
| t_state * f; |
| t_state * t; |
| t_transition * tp; |
| byte bytebuf[8]; |
| |
| srcp = (char *) chr; |
| srcc = chlen; |
| dstp = (char *) bytebuf; |
| dstc = sizeof bytebuf; |
| iconv(icc, &srcp, &srcc, &dstp, &dstc); |
| dstp = (char *) bytebuf; |
| cnt = sizeof bytebuf - dstc; |
| t = *to; |
| f = *from; |
| |
| /** |
| *** Check for end of string. |
| **/ |
| |
| if (!len) |
| if (t && t != final) |
| uniquetransition(EPSILON, t, final); |
| else |
| t = final; |
| |
| if (f) |
| while (cnt) { |
| tp = uniquetransition(*dstp, f, (t_state *) NULL); |
| |
| if (!tp) |
| break; |
| |
| f = tp->t_to; |
| dstp++; |
| cnt--; |
| } |
| |
| if (!cnt) { |
| if (!t) |
| t = nfadevelop(f, final, icc, name, len); |
| |
| *to = t; |
| return; |
| } |
| |
| if (!t) { |
| t = nfadevelop((t_state *) NULL, final, icc, name, len); |
| *to = t; |
| } |
| |
| if (!f) |
| *from = f = newstate(); |
| |
| while (cnt > 1) |
| t = nfatransition(t, dstp[--cnt]); |
| |
| newtransition(*dstp, f, t); |
| } |
| |
| |
| t_state * |
| nfadevelop(t_state * from, t_state * final, iconv_t icc, |
| const utf8char * name, unsigned int len) |
| |
| { |
| int chlen; |
| int i; |
| t_state * to; |
| int uccnt; |
| int lccnt; |
| utf8char chr; |
| |
| chlen = utf8_chlen[*name]; |
| |
| for (i = 1; i < chlen; i++) |
| if ((name[i] & 0xC0) != 0x80) |
| break; |
| |
| if (i != chlen) { |
| fprintf(stderr, |
| "Invalid UTF8 character in character set name\n"); |
| return (t_state *) NULL; |
| } |
| |
| to = (t_state *) NULL; |
| nfaslice(&from, &to, |
| icc, name, chlen, name + chlen, len - chlen, final); |
| |
| if (*name >= UTF8_a && *name <= UTF8_z) |
| chr = *name - UTF8_a + UTF8_A; |
| else if (*name >= UTF8_A && *name <= UTF8_Z) |
| chr = *name - UTF8_A + UTF8_a; |
| else |
| return from; |
| |
| nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final); |
| return from; |
| } |
| |
| |
| |
| void |
| nfaenter(const utf8char * name, int len, t_chset * charset) |
| |
| { |
| t_chset * s; |
| t_state * final; |
| t_state * sp; |
| t_symlist * lp; |
| |
| /** |
| *** Enter case-insensitive `name' in NFA in all known |
| *** character codes. |
| *** Redundant shift state changes as well as shift state |
| *** differences between uppercase and lowercase are |
| *** not handled. |
| **/ |
| |
| if (len < 0) |
| len = strlen(name) + 1; |
| |
| for (lp = charset->c_names; lp; lp = lp->l_next) |
| if (!memcmp(name, lp->l_symbol, len)) |
| return; /* Already entered. */ |
| |
| lp = (t_symlist *) malloc(sizeof *lp + len); |
| chknull(lp); |
| memcpy(lp->l_symbol, name, len); |
| lp->l_symbol[len] = '\0'; |
| lp->l_next = charset->c_names; |
| charset->c_names = lp; |
| final = newstate(); |
| final->s_final = charset; |
| |
| for (s = chset_list; s; s = s->c_next) |
| if (!iconv_open_error(s->c_fromUTF8)) |
| sp = nfadevelop(initial_state, final, |
| s->c_fromUTF8, name, len); |
| } |
| |
| |
| unsigned int |
| utf8_utostr(utf8char * s, unsigned int v) |
| |
| { |
| unsigned int d; |
| unsigned int i; |
| |
| d = v / 10; |
| v -= d * 10; |
| i = d? utf8_utostr(s, d): 0; |
| s[i++] = v + UTF8_0; |
| s[i] = '\0'; |
| return i; |
| } |
| |
| |
| unsigned int |
| utf8_utostrpad(utf8char * s, unsigned int v, int digits) |
| |
| { |
| unsigned int i = utf8_utostr(s, v); |
| utf8char pad = UTF8_SPACE; |
| |
| if (digits < 0) { |
| pad = UTF8_0; |
| digits = -digits; |
| } |
| |
| if (i >= digits) |
| return i; |
| |
| memmove(s + digits - i, s, i + 1); |
| memset(s, pad, digits - i); |
| return digits; |
| } |
| |
| |
| unsigned int |
| utf8_strtou(const utf8char * s) |
| |
| { |
| unsigned int v; |
| |
| while (*s == UTF8_SPACE || *s == UTF8_HT) |
| s++; |
| |
| for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;) |
| v = 10 * v + *s++ - UTF8_0; |
| |
| return v; |
| } |
| |
| |
| unsigned int |
| getNumAttr(xmlNodePtr node, const xmlChar * name) |
| |
| { |
| const xmlChar * s; |
| unsigned int val; |
| |
| s = xmlGetProp(node, name); |
| |
| if (!s) |
| return 0; |
| |
| val = utf8_strtou(s); |
| xmlFree((xmlChar *) s); |
| return val; |
| } |
| |
| |
| void |
| read_assocs(const char * filename) |
| |
| { |
| xmlDocPtr doc; |
| xmlXPathContextPtr ctxt; |
| xmlXPathObjectPtr obj; |
| xmlNodePtr node; |
| t_chset * sp; |
| int i; |
| unsigned int ccsid; |
| unsigned int mibenum; |
| utf8char symbuf[32]; |
| |
| doc = loadXMLFile(filename); |
| |
| if (!doc) { |
| fprintf(stderr, "Cannot load file %s\n", filename); |
| exit(1); |
| } |
| |
| ctxt = xmlXPathNewContext(doc); |
| obj = xmlXPathEval(utf8_assocnodes, ctxt); |
| |
| if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval || |
| !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) { |
| fprintf(stderr, "No association found in %s\n", filename); |
| exit(1); |
| } |
| |
| for (i = 0; i < obj->nodesetval->nodeNr; i++) { |
| node = obj->nodesetval->nodeTab[i]; |
| ccsid = getNumAttr(node, utf8_ccsid); |
| mibenum = getNumAttr(node, utf8_mibenum); |
| |
| /** |
| *** Check for duplicate. |
| **/ |
| |
| for (sp = chset_list; sp; sp = sp->c_next) |
| if (ccsid && ccsid == sp->c_ccsid || |
| mibenum && mibenum == sp->c_mibenum) { |
| fprintf(stderr, "Duplicate character set: "); |
| fprintf(stderr, "CCSID = %u/%u, ", |
| ccsid, sp->c_ccsid); |
| fprintf(stderr, "MIBenum = %u/%u\n", |
| mibenum, sp->c_mibenum); |
| break; |
| } |
| |
| if (sp) |
| continue; |
| |
| /** |
| *** Allocate the new character set. |
| **/ |
| |
| sp = (t_chset *) malloc(sizeof *sp); |
| chknull(sp); |
| memset(sp, 0, sizeof *sp); |
| |
| if (!ccsid) /* Do not attempt with current job CCSID. */ |
| set_iconv_open_error(sp->c_fromUTF8); |
| else { |
| sp->c_fromUTF8 = |
| iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0); |
| |
| if (iconv_open_error(sp->c_fromUTF8) == -1) |
| fprintf(stderr, |
| "Cannot convert into CCSID %u: ignored\n", |
| ccsid); |
| } |
| |
| sp->c_ccsid = ccsid; |
| sp->c_mibenum = mibenum; |
| sp->c_next = chset_list; |
| chset_list = sp; |
| } |
| |
| xmlXPathFreeObject(obj); |
| |
| /** |
| *** Enter aliases. |
| **/ |
| |
| for (sp = chset_list; sp; sp = sp->c_next) { |
| strcpy(symbuf, utf8_ibm_); |
| utf8_utostr(symbuf + 4, sp->c_ccsid); |
| nfaenter(symbuf, -1, sp); |
| strcpy(symbuf, utf8_IBMCCSID); |
| utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5); |
| nfaenter(symbuf, 13, sp); /* Not null-terminated. */ |
| |
| if (sp->c_mibenum) { |
| strcpy(symbuf, utf8_iana_); |
| utf8_utostr(symbuf + 5, sp->c_mibenum); |
| nfaenter(symbuf, -1, sp); |
| } |
| |
| xmlXPathRegisterVariable(ctxt, utf8_C, |
| xmlXPathNewFloat((double) sp->c_ccsid)); |
| obj = xmlXPathEval(utf8_aliastext, ctxt); |
| |
| if (!obj || obj->type != XPATH_NODESET) { |
| fprintf(stderr, "getAlias failed in %s\n", filename); |
| exit(1); |
| } |
| |
| if (obj->nodesetval && |
| obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) { |
| for (i = 0; i < obj->nodesetval->nodeNr; i++) { |
| node = obj->nodesetval->nodeTab[i]; |
| nfaenter(node->content, -1, sp); |
| } |
| } |
| |
| xmlXPathFreeObject(obj); |
| } |
| |
| xmlXPathFreeContext(ctxt); |
| xmlFreeDoc(doc); |
| } |
| |
| |
| unsigned int |
| columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header) |
| |
| { |
| xmlXPathObjectPtr obj; |
| unsigned int res = 0; |
| |
| xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header)); |
| obj = xmlXPathEval(utf8_headerpos, ctxt); |
| |
| if (obj) { |
| if (obj->type == XPATH_NUMBER) |
| res = (unsigned int) obj->floatval; |
| |
| xmlXPathFreeObject(obj); |
| } |
| |
| return res; |
| } |
| |
| |
| void |
| read_iana(const char * filename) |
| |
| { |
| xmlDocPtr doc; |
| xmlXPathContextPtr ctxt; |
| xmlXPathObjectPtr obj1; |
| xmlXPathObjectPtr obj2; |
| xmlNodePtr node; |
| int prefnamecol; |
| int namecol; |
| int mibenumcol; |
| int aliascol; |
| int mibenum; |
| t_chset * sp; |
| int n; |
| int i; |
| |
| doc = loadXMLFile(filename); |
| |
| if (!doc) { |
| fprintf(stderr, "Cannot load file %s\n", filename); |
| exit(1); |
| } |
| |
| ctxt = xmlXPathNewContext(doc); |
| |
| #ifndef OLDXML |
| xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri); |
| #endif |
| |
| obj1 = xmlXPathEval(utf8_tablerows, ctxt); |
| |
| if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval || |
| !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) { |
| fprintf(stderr, "No data in %s\n", filename); |
| exit(1); |
| } |
| |
| /** |
| *** Identify columns. |
| **/ |
| |
| xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt); |
| prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name); |
| namecol = columnPosition(ctxt, utf8_Name); |
| mibenumcol = columnPosition(ctxt, utf8_MIBenum); |
| aliascol = columnPosition(ctxt, utf8_Aliases); |
| |
| if (!prefnamecol || !namecol || !mibenumcol || !aliascol) { |
| fprintf(stderr, "Key column(s) missing in %s\n", filename); |
| exit(1); |
| } |
| |
| xmlXPathRegisterVariable(ctxt, utf8_P, |
| xmlXPathNewFloat((double) prefnamecol)); |
| xmlXPathRegisterVariable(ctxt, utf8_N, |
| xmlXPathNewFloat((double) namecol)); |
| xmlXPathRegisterVariable(ctxt, utf8_M, |
| xmlXPathNewFloat((double) mibenumcol)); |
| xmlXPathRegisterVariable(ctxt, utf8_A, |
| xmlXPathNewFloat((double) aliascol)); |
| |
| /** |
| *** Process each row. |
| **/ |
| |
| for (n = 1; n < obj1->nodesetval->nodeNr; n++) { |
| xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt); |
| |
| /** |
| *** Get the MIBenum from current row. |
| */ |
| |
| obj2 = xmlXPathEval(utf8_getmibenum, ctxt); |
| |
| if (!obj2 || obj2->type != XPATH_NUMBER) { |
| fprintf(stderr, "get MIBenum failed at row %u\n", n); |
| exit(1); |
| } |
| |
| if (xmlXPathIsNaN(obj2->floatval) || |
| obj2->floatval < 1.0 || obj2->floatval > 65535.0 || |
| ((unsigned int) obj2->floatval) != obj2->floatval) { |
| fprintf(stderr, "invalid MIBenum at row %u\n", n); |
| xmlXPathFreeObject(obj2); |
| continue; |
| } |
| |
| mibenum = obj2->floatval; |
| xmlXPathFreeObject(obj2); |
| |
| /** |
| *** Search the associations for a corresponding CCSID. |
| **/ |
| |
| for (sp = chset_list; sp; sp = sp->c_next) |
| if (sp->c_mibenum == mibenum) |
| break; |
| |
| if (!sp) |
| continue; /* No CCSID for this MIBenum. */ |
| |
| /** |
| *** Process preferred MIME name. |
| **/ |
| |
| obj2 = xmlXPathEval(utf8_getprefname, ctxt); |
| |
| if (!obj2 || obj2->type != XPATH_STRING) { |
| fprintf(stderr, |
| "get Preferred_MIME_Name failed at row %u\n", n); |
| exit(1); |
| } |
| |
| if (obj2->stringval && obj2->stringval[0]) |
| nfaenter(obj2->stringval, -1, sp); |
| |
| xmlXPathFreeObject(obj2); |
| |
| /** |
| *** Process name. |
| **/ |
| |
| obj2 = xmlXPathEval(utf8_getname, ctxt); |
| |
| if (!obj2 || obj2->type != XPATH_STRING) { |
| fprintf(stderr, "get name failed at row %u\n", n); |
| exit(1); |
| } |
| |
| if (obj2->stringval && obj2->stringval[0]) |
| nfaenter(obj2->stringval, -1, sp); |
| |
| xmlXPathFreeObject(obj2); |
| |
| /** |
| *** Process aliases. |
| **/ |
| |
| obj2 = xmlXPathEval(utf8_getaliases, ctxt); |
| |
| if (!obj2 || obj2->type != XPATH_NODESET) { |
| fprintf(stderr, "get aliases failed at row %u\n", n); |
| exit(1); |
| } |
| |
| if (obj2->nodesetval && obj2->nodesetval->nodeTab) |
| for (i = 0; i < obj2->nodesetval->nodeNr; i++) { |
| node = obj2->nodesetval->nodeTab[i]; |
| |
| if (node && node->content && node->content[0]) |
| nfaenter(node->content, -1, sp); |
| } |
| |
| xmlXPathFreeObject(obj2); |
| } |
| |
| xmlXPathFreeObject(obj1); |
| xmlXPathFreeContext(ctxt); |
| xmlFreeDoc(doc); |
| } |
| |
| |
| t_powerset * closureset(t_powerset * dst, t_powerset * src); |
| |
| |
| t_powerset * |
| closure(t_powerset * dst, t_state * src) |
| |
| { |
| t_transition * t; |
| unsigned int oldcard; |
| |
| if (src->s_nfastates) { |
| /** |
| *** Is a DFA state: return closure of set of equivalent |
| *** NFA states. |
| **/ |
| |
| return closureset(dst, src->s_nfastates); |
| } |
| |
| /** |
| *** Compute closure of NFA state. |
| **/ |
| |
| dst = set_include(dst, src); |
| |
| for (t = src->s_forward; t; t = t->t_forwnext) |
| if (t->t_token == EPSILON) { |
| oldcard = dst->p_card; |
| dst = set_include(dst, t->t_to); |
| |
| if (oldcard != dst->p_card) |
| dst = closure(dst, t->t_to); |
| } |
| |
| return dst; |
| } |
| |
| |
| t_powerset * |
| closureset(t_powerset * dst, t_powerset * src) |
| |
| { |
| unsigned int i; |
| |
| for (i = 0; i < src->p_card; i++) |
| dst = closure(dst, (t_state *) src->p_set[i]); |
| |
| return dst; |
| } |
| |
| |
| t_state * |
| get_dfa_state(t_state * * stack, |
| t_powerset * nfastates, xmlHashTablePtr sethash) |
| |
| { |
| t_state * s; |
| |
| if (s = hash_get(sethash, nfastates->p_set, |
| nfastates->p_card * sizeof nfastates->p_set[0])) { |
| /** |
| *** DFA state already present. |
| *** Release the NFA state set and return |
| *** the address of the old DFA state. |
| **/ |
| |
| free((char *) nfastates); |
| return s; |
| } |
| |
| /** |
| *** Build the new state. |
| **/ |
| |
| s = newstate(); |
| s->s_nfastates = nfastates; |
| s->s_next = dfa_states; |
| dfa_states = s; |
| s->s_stack = *stack; |
| *stack = s; |
| |
| /** |
| *** Enter it in hash. |
| **/ |
| |
| if (hash_add(sethash, nfastates->p_set, |
| nfastates->p_card * sizeof nfastates->p_set[0], s)) |
| chknull(NULL); /* Memory allocation error. */ |
| |
| return s; |
| } |
| |
| |
| int |
| transcmp(const void * p1, const void * p2) |
| |
| { |
| t_transition * t1; |
| t_transition * t2; |
| |
| t1 = *(t_transition * *) p1; |
| t2 = *(t_transition * *) p2; |
| return ((int) t1->t_token) - ((int) t2->t_token); |
| } |
| |
| |
| void |
| builddfa(void) |
| |
| { |
| t_powerset * transset; |
| t_powerset * stateset; |
| t_state * s; |
| t_state * s2; |
| unsigned int n; |
| unsigned int i; |
| unsigned int token; |
| t_transition * t; |
| t_state * stack; |
| xmlHashTablePtr sethash; |
| unsigned int nst; |
| |
| transset = set_include(NULL, NULL); |
| chknull(transset); |
| stateset = set_include(NULL, NULL); |
| chknull(stateset); |
| sethash = xmlHashCreate(1); |
| chknull(sethash); |
| dfa_states = (t_state *) NULL; |
| stack = (t_state *) NULL; |
| nst = 0; |
| |
| /** |
| *** Build the DFA initial state. |
| **/ |
| |
| get_dfa_state(&stack, closure(NULL, initial_state), sethash); |
| |
| /** |
| *** Build the other DFA states by looking at each |
| *** possible transition from stacked DFA states. |
| **/ |
| |
| do { |
| if (!(++nst % 100)) |
| fprintf(stderr, "%u DFA states\n", nst); |
| |
| s = stack; |
| stack = s->s_stack; |
| s->s_stack = (t_state *) NULL; |
| |
| /** |
| *** Build a set of all non-epsilon transitions from this |
| *** state. |
| **/ |
| |
| transset->p_card = 0; |
| |
| for (n = 0; n < s->s_nfastates->p_card; n++) { |
| s2 = s->s_nfastates->p_set[n]; |
| |
| for (t = s2->s_forward; t; t = t->t_forwnext) |
| if (t->t_token != EPSILON) { |
| transset = set_include(transset, t); |
| chknull(transset); |
| } |
| } |
| |
| /** |
| *** Sort transitions by token. |
| **/ |
| |
| qsort(transset->p_set, transset->p_card, |
| sizeof transset->p_set[0], transcmp); |
| |
| /** |
| *** Process all transitions, grouping them by token. |
| **/ |
| |
| stateset->p_card = 0; |
| token = EPSILON; |
| |
| for (i = 0; i < transset->p_card; i++) { |
| t = transset->p_set[i]; |
| |
| if (token != t->t_token) { |
| if (stateset->p_card) { |
| /** |
| *** Get the equivalent DFA state |
| *** and create transition. |
| **/ |
| |
| newtransition(token, s, |
| get_dfa_state(&stack, |
| closureset(NULL, stateset), |
| sethash)); |
| stateset->p_card = 0; |
| } |
| |
| token = t->t_token; |
| } |
| |
| stateset = set_include(stateset, t->t_to); |
| } |
| |
| if (stateset->p_card) |
| newtransition(token, s, get_dfa_state(&stack, |
| closureset(NULL, stateset), sethash)); |
| } while (stack); |
| |
| free((char *) transset); |
| free((char *) stateset); |
| xmlHashFree(sethash, NULL); |
| |
| /** |
| *** Reverse the state list to get the initial state first, |
| *** check for ambiguous prefixes, determine final states, |
| *** destroy NFA state sets. |
| **/ |
| |
| while (s = dfa_states) { |
| dfa_states = s->s_next; |
| s->s_next = stack; |
| stack = s; |
| stateset = s->s_nfastates; |
| s->s_nfastates = (t_powerset *) NULL; |
| |
| for (n = 0; n < stateset->p_card; n++) { |
| s2 = (t_state *) stateset->p_set[n]; |
| |
| if (s2->s_final) { |
| if (s->s_final && s->s_final != s2->s_final) |
| fprintf(stderr, |
| "Ambiguous name for CCSIDs %u/%u\n", |
| s->s_final->c_ccsid, |
| s2->s_final->c_ccsid); |
| |
| s->s_final = s2->s_final; |
| } |
| } |
| |
| free((char *) stateset); |
| } |
| |
| dfa_states = stack; |
| } |
| |
| |
| void |
| deletenfa(void) |
| |
| { |
| t_transition * t; |
| t_state * s; |
| t_state * u; |
| t_state * stack; |
| |
| stack = initial_state; |
| stack->s_stack = (t_state *) NULL; |
| |
| while ((s = stack)) { |
| stack = s->s_stack; |
| |
| while ((t = s->s_forward)) { |
| u = t->t_to; |
| unlink_transition(t); |
| free((char *) t); |
| |
| if (!u->s_backward) { |
| u->s_stack = stack; |
| stack = u; |
| } |
| } |
| |
| free((char *) s); |
| } |
| } |
| |
| |
| t_stategroup * |
| newgroup(void) |
| |
| { |
| t_stategroup * g; |
| |
| g = (t_stategroup *) malloc(sizeof *g); |
| chknull(g); |
| memset((char *) g, 0, sizeof *g); |
| g->g_id = groupid++; |
| return g; |
| } |
| |
| |
| void |
| optimizedfa(void) |
| |
| { |
| unsigned int i; |
| xmlHashTablePtr h; |
| t_state * s1; |
| t_state * s2; |
| t_state * finstates; |
| t_state * * sp; |
| t_stategroup * g1; |
| t_stategroup * g2; |
| t_stategroup * ghead; |
| t_transition * t1; |
| t_transition * t2; |
| unsigned int done; |
| unsigned int startgroup; |
| unsigned int gtrans[1 << (8 * sizeof(unsigned char))]; |
| |
| /** |
| *** Reduce DFA state count. |
| **/ |
| |
| groupid = 0; |
| ghead = (t_stategroup *) NULL; |
| |
| /** |
| *** First split: non-final and each distinct final states. |
| **/ |
| |
| h = xmlHashCreate(4); |
| chknull(h); |
| |
| for (s1 = dfa_states; s1; s1 = s1->s_next) { |
| if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) { |
| g1 = newgroup(); |
| g1->g_next = ghead; |
| ghead = g1; |
| |
| if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1)) |
| chknull(NULL); /* Memory allocation error. */ |
| } |
| |
| s1->s_index = g1->g_id; |
| s1->s_stack = g1->g_member; |
| g1->g_member = s1; |
| } |
| |
| xmlHashFree(h, NULL); |
| |
| /** |
| *** Subsequent splits: states that have the same forward |
| *** transition tokens to states in the same group. |
| **/ |
| |
| do { |
| done = 1; |
| |
| for (g2 = ghead; g2; g2 = g2->g_next) { |
| s1 = g2->g_member; |
| |
| if (!s1->s_stack) |
| continue; |
| |
| h = xmlHashCreate(1); |
| chknull(h); |
| |
| /** |
| *** Build the group transition map. |
| **/ |
| |
| memset((char *) gtrans, ~0, sizeof gtrans); |
| |
| for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext) |
| gtrans[t1->t_token] = t1->t_to->s_index; |
| |
| if (hash_add(h, gtrans, sizeof gtrans, g2)) |
| chknull(NULL); |
| |
| /** |
| *** Process other states in group. |
| **/ |
| |
| sp = &s1->s_stack; |
| s1 = *sp; |
| |
| do { |
| *sp = s1->s_stack; |
| |
| /** |
| *** Build the transition map. |
| **/ |
| |
| memset((char *) gtrans, ~0, sizeof gtrans); |
| |
| for (t1 = s1->s_forward; |
| t1; t1 = t1->t_forwnext) |
| gtrans[t1->t_token] = t1->t_to->s_index; |
| |
| g1 = hash_get(h, gtrans, sizeof gtrans); |
| |
| if (g1 == g2) { |
| *sp = s1; |
| sp = &s1->s_stack; |
| } |
| else { |
| if (!g1) { |
| g1 = newgroup(); |
| g1->g_next = ghead; |
| ghead = g1; |
| |
| if (hash_add(h, gtrans, |
| sizeof gtrans, g1)) |
| chknull(NULL); |
| } |
| |
| s1->s_index = g1->g_id; |
| s1->s_stack = g1->g_member; |
| g1->g_member = s1; |
| done = 0; |
| } |
| } while (s1 = *sp); |
| |
| xmlHashFree(h, NULL); |
| } |
| } while (!done); |
| |
| /** |
| *** Establish group leaders and remap transitions. |
| **/ |
| |
| startgroup = dfa_states->s_index; |
| |
| for (g1 = ghead; g1; g1 = g1->g_next) |
| for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack) |
| for (t1 = s1->s_backward; t1; t1 = t2) { |
| t2 = t1->t_backnext; |
| link_transition(t1, NULL, g1->g_member); |
| } |
| |
| /** |
| *** Remove redundant states and transitions. |
| **/ |
| |
| for (g1 = ghead; g1; g1 = g1->g_next) { |
| g1->g_member->s_next = (t_state *) NULL; |
| |
| while ((s1 = g1->g_member->s_stack)) { |
| g1->g_member->s_stack = s1->s_stack; |
| |
| for (t1 = s1->s_forward; t1; t1 = t2) { |
| t2 = t1->t_forwnext; |
| unlink_transition(t1); |
| free((char *) t1); |
| } |
| |
| free((char *) s1); |
| } |
| } |
| |
| /** |
| *** Remove group support and relink DFA states. |
| **/ |
| |
| dfa_states = (t_state *) NULL; |
| s2 = (t_state *) NULL; |
| finstates = (t_state *) NULL; |
| |
| while (g1 = ghead) { |
| ghead = g1->g_next; |
| s1 = g1->g_member; |
| |
| if (g1->g_id == startgroup) |
| dfa_states = s1; /* Keep start state first. */ |
| else if (s1->s_final) { /* Then final states. */ |
| s1->s_next = finstates; |
| finstates = s1; |
| } |
| else { /* Finish with non-final states. */ |
| s1->s_next = s2; |
| s2 = s1; |
| } |
| |
| free((char *) g1); |
| } |
| |
| for (dfa_states->s_next = finstates; finstates->s_next;) |
| finstates = finstates->s_next; |
| |
| finstates->s_next = s2; |
| } |
| |
| |
/**
*** Name the narrowest unsigned C integer type able to hold `max'.
***
*** The number of significant bits in `max' is compared against the width
***     of each candidate type, computed as 8 bits per byte.
***
*** Returns a pointer to a constant string: "unsigned char",
***     "unsigned short", "unsigned int" or "unsigned long".
**/

const char *
inttype(unsigned long max)

{
    unsigned int bits = 0;      /* Significant bit count of `max'. */

    while (max) {
        bits++;
        max >>= 1;
    }

    /* Candidates are tried from narrowest to widest. */

    if (bits <= 8 * sizeof(unsigned char))
        return "unsigned char";

    if (bits <= 8 * sizeof(unsigned short))
        return "unsigned short";

    if (bits <= 8 * sizeof(unsigned int))
        return "unsigned int";

    return "unsigned long";
}
| |
| |
| listids(FILE * fp) |
| |
| { |
| unsigned int pos; |
| t_chset * cp; |
| t_symlist * lp; |
| char * srcp; |
| char * dstp; |
| size_t srcc; |
| size_t dstc; |
| char buf[80]; |
| |
| fprintf(fp, "/**\n*** CCSID For arg Recognized name.\n"); |
| pos = 0; |
| |
| for (cp = chset_list; cp; cp = cp->c_next) { |
| if (pos) { |
| fprintf(fp, "\n"); |
| pos = 0; |
| } |
| |
| if (!cp->c_names) |
| continue; |
| |
| pos = fprintf(fp, "*** %5u %c ", cp->c_ccsid, |
| iconv_open_error(cp->c_fromUTF8)? ' ': 'X'); |
| |
| for (lp = cp->c_names; lp; lp = lp->l_next) { |
| srcp = (char *) lp->l_symbol; |
| srcc = strlen(srcp); |
| dstp = buf; |
| dstc = sizeof buf; |
| iconv(utf82job, &srcp, &srcc, &dstp, &dstc); |
| srcc = dstp - buf; |
| |
| if (pos + srcc > 79) { |
| fprintf(fp, "\n***%22c", ' '); |
| pos = 25; |
| } |
| |
| pos += fprintf(fp, " %.*s", srcc, buf); |
| } |
| } |
| |
| if (pos) |
| fprintf(fp, "\n"); |
| |
| fprintf(fp, "**/\n\n"); |
| } |
| |
| |
| void |
| generate(FILE * fp) |
| |
| { |
| unsigned int nstates; |
| unsigned int ntrans; |
| unsigned int maxfinal; |
| t_state * s; |
| t_transition * t; |
| unsigned int i; |
| unsigned int pos; |
| char * ns; |
| |
| /** |
| *** Assign indexes to states and transitions. |
| **/ |
| |
| nstates = 0; |
| ntrans = 0; |
| maxfinal = 0; |
| |
| for (s = dfa_states; s; s = s->s_next) { |
| s->s_index = nstates++; |
| |
| if (s->s_final) |
| maxfinal = nstates; |
| |
| for (t = s->s_forward; t; t = t->t_forwnext) |
| t->t_index = ntrans++; |
| } |
| |
| fprintf(fp, |
| "/**\n*** %u states, %u finals, %u transitions.\n**/\n\n", |
| nstates, maxfinal, ntrans); |
| fprintf(stderr, "%u states, %u finals, %u transitions.\n", |
| nstates, maxfinal, ntrans); |
| |
| /** |
| *** Generate types. |
| **/ |
| |
| fprintf(fp, "typedef unsigned short t_ccsid;\n"); |
| fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates)); |
| fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans)); |
| |
| /** |
| *** Generate first transition index for each state. |
| **/ |
| |
| fprintf(fp, "static t_transrange trans_array[] = {\n"); |
| pos = 0; |
| ntrans = 0; |
| |
| for (s = dfa_states; s; s = s->s_next) { |
| pos += fprintf(fp, " %u,", ntrans); |
| |
| if (pos > 72) { |
| fprintf(fp, "\n"); |
| pos = 0; |
| } |
| |
| for (t = s->s_forward; t; t = t->t_forwnext) |
| ntrans++; |
| } |
| |
| fprintf(fp, " %u\n};\n\n", ntrans); |
| |
| /** |
| *** Generate final state info. |
| **/ |
| |
| fprintf(fp, "static t_ccsid final_array[] = {\n"); |
| pos = 0; |
| ns =""; |
| i = 0; |
| |
| for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) { |
| pos += fprintf(fp, "%s", ns); |
| ns = ","; |
| |
| if (pos > 72) { |
| fprintf(fp, "\n"); |
| pos = 0; |
| } |
| |
| pos += fprintf(fp, " %u", |
| s->s_final? s->s_final->c_ccsid + 1: 0); |
| } |
| |
| fprintf(fp, "\n};\n\n"); |
| |
| /** |
| *** Generate goto table. |
| **/ |
| |
| fprintf(fp, "static t_staterange goto_array[] = {\n"); |
| pos = 0; |
| |
| for (s = dfa_states; s; s = s->s_next) |
| for (t = s->s_forward; t; t = t->t_forwnext) { |
| pos += fprintf(fp, " %u,", t->t_to->s_index); |
| |
| if (pos > 72) { |
| fprintf(fp, "\n"); |
| pos = 0; |
| } |
| } |
| |
| fprintf(fp, " %u\n};\n\n", nstates); |
| |
| /** |
| *** Generate transition label table. |
| **/ |
| |
| fprintf(fp, "static unsigned char label_array[] = {\n"); |
| pos = 0; |
| ns =""; |
| |
| for (s = dfa_states; s; s = s->s_next) |
| for (t = s->s_forward; t; t = t->t_forwnext) { |
| pos += fprintf(fp, "%s", ns); |
| ns = ","; |
| |
| if (pos > 72) { |
| fprintf(fp, "\n"); |
| pos = 0; |
| } |
| |
| pos += fprintf(fp, " 0x%02X", t->t_token); |
| } |
| |
| fprintf(fp, "\n};\n", nstates); |
| } |
| |
| |
| main(argc, argv) |
| int argc; |
| char * * argv; |
| |
| { |
| FILE * fp; |
| t_chset * csp; |
| char symbuf[20]; |
| |
| chset_list = (t_chset *) NULL; |
| initial_state = newstate(); |
| job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0); |
| utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0); |
| |
| if (argc != 4) { |
| fprintf(stderr, "Usage: %s <ccsid-mibenum file> ", *argv); |
| fprintf(stderr, "<iana-character-set file> <output file>\n"); |
| exit(1); |
| } |
| |
| /** |
| *** Read CCSID/MIBenum associations. Define special names. |
| **/ |
| |
| read_assocs(argv[1]); |
| |
| /** |
| *** Read character set names and establish the case-independent |
| *** name DFA in all possible CCSIDs. |
| **/ |
| |
| read_iana(argv[2]); |
| |
| /** |
| *** Build DFA from NFA. |
| **/ |
| |
| builddfa(); |
| |
| /** |
| *** Delete NFA. |
| **/ |
| |
| deletenfa(); |
| |
| /** |
| *** Minimize the DFA state count. |
| **/ |
| |
| optimizedfa(); |
| |
| /** |
| *** Generate the table. |
| **/ |
| |
| fp = fopen(argv[3], "w+"); |
| |
| if (!fp) { |
| perror(argv[3]); |
| exit(1); |
| } |
| |
| fprintf(fp, "/**\n"); |
| fprintf(fp, "*** Character set names table.\n"); |
| fprintf(fp, "*** Generated by program BLDCSNDFA from"); |
| fprintf(fp, " IANA character set assignment file\n"); |
| fprintf(fp, "*** and CCSID/MIBenum equivalence file.\n"); |
| fprintf(fp, "*** *** Do not edit by hand ***\n"); |
| fprintf(fp, "**/\n\n"); |
| listids(fp); |
| generate(fp); |
| |
| if (ferror(fp)) { |
| perror(argv[3]); |
| fclose(fp); |
| exit(1); |
| } |
| |
| fclose(fp); |
| iconv_close(job2utf8); |
| iconv_close(utf82job); |
| exit(0); |
| } |