/* unicode character name tables */
/* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
| 3 | |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 4 | #include "Python.h" |
Guido van Rossum | 4f4b799 | 2000-06-29 00:06:39 +0000 | [diff] [blame] | 5 | #include "ucnhash.h" |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 6 | |
/* data file generated by Tools/unicode/makeunicodedata.py */
| 8 | #include "unicodename_db.h" |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 9 | |
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 12 | |
/* Compute the case-insensitive hash of a NUL-terminated character name.

   Every character is upper-cased and folded into a multiplicative hash.
   Whenever bits 24..31 of the running value become non-zero they are
   XORed back into the low byte and cleared, so the result always fits
   in 24 bits. */
static unsigned long
gethash(const char *s)
{
    unsigned long h = 0;
    unsigned long high;

    while (*s) {
        /* magic value 47 was chosen to minimize the number
           of collisions for the uninames dataset.  see the
           makeunicodedata script for more background */
        /* convert to unsigned char *before* calling toupper(): handing
           a negative char value to a <ctype.h> function is undefined */
        h = (h * 47) + (unsigned char) toupper((unsigned char) *s++);
        high = h & 0xff000000;
        if (high)
            h = (h ^ ((high >> 24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
| 29 | |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 30 | static int |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 31 | getname(Py_UCS4 code, char* buffer, int buflen) |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 32 | { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 33 | int offset; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 34 | int i; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 35 | int word; |
| 36 | unsigned char* w; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 37 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 38 | if (code < 0 || code >= 65536) |
| 39 | return 0; |
| 40 | |
| 41 | /* get offset into phrasebook */ |
| 42 | offset = phrasebook_offset1[(code>>SHIFT)]; |
| 43 | offset = phrasebook_offset2[(offset<<SHIFT)+(code&((1<<SHIFT)-1))]; |
| 44 | if (!offset) |
| 45 | return 0; |
| 46 | |
| 47 | i = 0; |
| 48 | |
| 49 | for (;;) { |
| 50 | /* get word index */ |
| 51 | if (phrasebook[offset] & 128) { |
| 52 | word = phrasebook[offset] & 127; |
| 53 | offset++; |
| 54 | } else { |
| 55 | word = (phrasebook[offset]<<8) + phrasebook[offset+1]; |
| 56 | offset+=2; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 57 | } |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 58 | if (i) { |
| 59 | if (i > buflen) |
| 60 | return 0; /* buffer overflow */ |
| 61 | buffer[i++] = ' '; |
| 62 | } |
| 63 | /* copy word string from lexicon. the last character in the |
| 64 | word has bit 7 set. the last word in a string ends with |
| 65 | 0x80 */ |
| 66 | w = lexicon + lexicon_offset[word]; |
| 67 | while (*w < 128) { |
| 68 | if (i >= buflen) |
| 69 | return 0; /* buffer overflow */ |
| 70 | buffer[i++] = *w++; |
| 71 | } |
| 72 | if (i >= buflen) |
| 73 | return 0; /* buffer overflow */ |
| 74 | buffer[i++] = *w & 127; |
| 75 | if (*w == 128) |
| 76 | break; /* end of word */ |
| 77 | } |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 78 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 79 | return 1; |
| 80 | } |
| 81 | |
| 82 | static int |
| 83 | cmpname(int code, const char* name) |
| 84 | { |
| 85 | /* check if code corresponds to the given name */ |
| 86 | int i; |
| 87 | char buffer[NAME_MAXLEN]; |
| 88 | if (!getname(code, buffer, sizeof(buffer))) |
| 89 | return 0; |
| 90 | i = 0; |
| 91 | for (;;) { |
| 92 | if (toupper(name[i]) != buffer[i]) |
| 93 | return 0; |
| 94 | if (!name[i] || !buffer[i]) |
| 95 | return 1; |
| 96 | i++; |
| 97 | } |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 98 | } |
| 99 | |
| 100 | static int |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 101 | getcode(const char* name, int namelen, Py_UCS4* code) |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 102 | { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 103 | unsigned int h, v; |
| 104 | unsigned int mask = CODE_SIZE-1; |
| 105 | unsigned int i, incr; |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 106 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 107 | /* the following is the same as python's dictionary lookup, with |
| 108 | only minor changes. see the makeunicodedata script for more |
| 109 | details */ |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 110 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 111 | h = (unsigned int) gethash(name); |
| 112 | i = (~h) & mask; |
| 113 | v = code_hash[i]; |
| 114 | if (!v) |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 115 | return 0; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 116 | if (cmpname(v, name)) { |
| 117 | *code = v; |
| 118 | return 1; |
| 119 | } |
| 120 | incr = (h ^ (h >> 3)) & mask; |
| 121 | if (!incr) |
| 122 | incr = mask; |
| 123 | for (;;) { |
| 124 | i = (i + incr) & mask; |
| 125 | v = code_hash[i]; |
| 126 | if (!v) |
| 127 | return -1; |
| 128 | if (cmpname(v, name)) { |
| 129 | *code = v; |
| 130 | return 1; |
| 131 | } |
| 132 | incr = incr << 1; |
| 133 | if (incr > mask) |
| 134 | incr = incr ^ CODE_POLY; |
| 135 | } |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 136 | } |
| 137 | |
| 138 | static const _PyUnicode_Name_CAPI hashAPI = |
| 139 | { |
| 140 | sizeof(_PyUnicode_Name_CAPI), |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 141 | getname, |
| 142 | getcode |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 143 | }; |
| 144 | |
/* -------------------------------------------------------------------- */
/* Python bindings */
| 147 | |
| 148 | static PyObject * |
| 149 | ucnhash_getname(PyObject* self, PyObject* args) |
| 150 | { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame^] | 151 | char name[NAME_MAXLEN]; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 152 | |
| 153 | int code; |
| 154 | if (!PyArg_ParseTuple(args, "i", &code)) |
| 155 | return NULL; |
| 156 | |
| 157 | if (!getname((Py_UCS4) code, name, sizeof(name))) { |
| 158 | PyErr_SetString(PyExc_ValueError, "undefined character code"); |
| 159 | return NULL; |
| 160 | } |
| 161 | |
| 162 | return Py_BuildValue("s", name); |
| 163 | } |
| 164 | |
| 165 | static PyObject * |
| 166 | ucnhash_getcode(PyObject* self, PyObject* args) |
| 167 | { |
| 168 | Py_UCS4 code; |
| 169 | |
| 170 | char* name; |
| 171 | int namelen; |
| 172 | if (!PyArg_ParseTuple(args, "s#", &name, &namelen)) |
| 173 | return NULL; |
| 174 | |
| 175 | if (!getcode(name, namelen, &code)) { |
| 176 | PyErr_SetString(PyExc_ValueError, "undefined character name"); |
| 177 | return NULL; |
| 178 | } |
| 179 | |
| 180 | return Py_BuildValue("i", code); |
| 181 | } |
| 182 | |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 183 | static |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 184 | PyMethodDef ucnhash_methods[] = |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 185 | { |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 186 | {"getname", ucnhash_getname, 1}, |
| 187 | {"getcode", ucnhash_getcode, 1}, |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 188 | {NULL, NULL}, |
| 189 | }; |
| 190 | |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 191 | static char *ucnhash_docstring = "ucnhash hash function module"; |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 192 | |
| 193 | |
| 194 | /* Create PyMethodObjects and register them in the module's dict */ |
| 195 | DL_EXPORT(void) |
| 196 | initucnhash(void) |
| 197 | { |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 198 | PyObject *m, *d, *v; |
| 199 | |
| 200 | m = Py_InitModule4( |
| 201 | "ucnhash", /* Module name */ |
| 202 | ucnhash_methods, /* Method list */ |
| 203 | ucnhash_docstring, /* Module doc-string */ |
| 204 | (PyObject *)NULL, /* always pass this as *self */ |
| 205 | PYTHON_API_VERSION); /* API Version */ |
| 206 | if (!m) |
| 207 | return; |
| 208 | |
| 209 | d = PyModule_GetDict(m); |
| 210 | if (!d) |
| 211 | return; |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 212 | |
| 213 | /* Export C API */ |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 214 | v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); |
| 215 | PyDict_SetItemString(d, "Unicode_Names_CAPI", v); |
| 216 | Py_XDECREF(v); |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 217 | } |