Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 1 | /* unicode character name tables */ |
| 2 | /* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */ |
| 3 | |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 4 | #include "Python.h" |
Guido van Rossum | 4f4b799 | 2000-06-29 00:06:39 +0000 | [diff] [blame] | 5 | #include "ucnhash.h" |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 6 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 7 | /* data file generated by Tools/unicode/makeunicodedata.py */ |
| 8 | #include "unicodename_db.h" |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 9 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 10 | /* -------------------------------------------------------------------- */ |
| 11 | /* database code (cut and pasted from the unidb package) */ |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 12 | |
/* Compute the case-insensitive hash of a character name.

   s      -- name bytes (a length is given, so no NUL terminator is needed)
   len    -- number of bytes of s to hash
   scale  -- multiplier applied to the running hash before each byte
             is added

   Every byte is upper-cased before it is mixed in.  Whenever any of
   bits 24..31 of the running value are set, they are XORed into the
   low byte and the value is masked back down to 24 bits. */
static unsigned long
gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* convert to unsigned char before calling toupper(): handing it
           a negative plain-char value is undefined behaviour */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
| 27 | |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 28 | static int |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 29 | getname(Py_UCS4 code, char* buffer, int buflen) |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 30 | { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 31 | int offset; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 32 | int i; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 33 | int word; |
| 34 | unsigned char* w; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 35 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 36 | if (code < 0 || code >= 65536) |
| 37 | return 0; |
| 38 | |
| 39 | /* get offset into phrasebook */ |
Fredrik Lundh | 9e9bcda | 2001-01-21 17:01:31 +0000 | [diff] [blame] | 40 | offset = phrasebook_offset1[(code>>phrasebook_shift)]; |
Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 41 | offset = phrasebook_offset2[(offset<<phrasebook_shift) + |
Fredrik Lundh | 9e9bcda | 2001-01-21 17:01:31 +0000 | [diff] [blame] | 42 | (code&((1<<phrasebook_shift)-1))]; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 43 | if (!offset) |
| 44 | return 0; |
| 45 | |
| 46 | i = 0; |
| 47 | |
| 48 | for (;;) { |
| 49 | /* get word index */ |
Fredrik Lundh | 7b7dd10 | 2001-01-21 22:41:08 +0000 | [diff] [blame] | 50 | word = phrasebook[offset] - phrasebook_short; |
| 51 | if (word >= 0) { |
| 52 | word = (word << 8) + phrasebook[offset+1]; |
| 53 | offset += 2; |
| 54 | } else |
| 55 | word = phrasebook[offset++]; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 56 | if (i) { |
| 57 | if (i > buflen) |
| 58 | return 0; /* buffer overflow */ |
| 59 | buffer[i++] = ' '; |
| 60 | } |
| 61 | /* copy word string from lexicon. the last character in the |
| 62 | word has bit 7 set. the last word in a string ends with |
| 63 | 0x80 */ |
| 64 | w = lexicon + lexicon_offset[word]; |
| 65 | while (*w < 128) { |
| 66 | if (i >= buflen) |
| 67 | return 0; /* buffer overflow */ |
| 68 | buffer[i++] = *w++; |
| 69 | } |
| 70 | if (i >= buflen) |
| 71 | return 0; /* buffer overflow */ |
| 72 | buffer[i++] = *w & 127; |
| 73 | if (*w == 128) |
| 74 | break; /* end of word */ |
| 75 | } |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 76 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 77 | return 1; |
| 78 | } |
| 79 | |
| 80 | static int |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 81 | cmpname(int code, const char* name, int namelen) |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 82 | { |
| 83 | /* check if code corresponds to the given name */ |
| 84 | int i; |
| 85 | char buffer[NAME_MAXLEN]; |
| 86 | if (!getname(code, buffer, sizeof(buffer))) |
| 87 | return 0; |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 88 | for (i = 0; i < namelen; i++) { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 89 | if (toupper(name[i]) != buffer[i]) |
| 90 | return 0; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 91 | } |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 92 | return buffer[namelen] == '\0'; |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 93 | } |
| 94 | |
| 95 | static int |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 96 | getcode(const char* name, int namelen, Py_UCS4* code) |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 97 | { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 98 | unsigned int h, v; |
Fredrik Lundh | 9e9bcda | 2001-01-21 17:01:31 +0000 | [diff] [blame] | 99 | unsigned int mask = code_size-1; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 100 | unsigned int i, incr; |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 101 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 102 | /* the following is the same as python's dictionary lookup, with |
| 103 | only minor changes. see the makeunicodedata script for more |
| 104 | details */ |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 105 | |
Fredrik Lundh | 9e9bcda | 2001-01-21 17:01:31 +0000 | [diff] [blame] | 106 | h = (unsigned int) gethash(name, namelen, code_magic); |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 107 | i = (~h) & mask; |
| 108 | v = code_hash[i]; |
| 109 | if (!v) |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 110 | return 0; |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 111 | if (cmpname(v, name, namelen)) { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 112 | *code = v; |
| 113 | return 1; |
| 114 | } |
| 115 | incr = (h ^ (h >> 3)) & mask; |
| 116 | if (!incr) |
| 117 | incr = mask; |
| 118 | for (;;) { |
| 119 | i = (i + incr) & mask; |
| 120 | v = code_hash[i]; |
| 121 | if (!v) |
| 122 | return -1; |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 123 | if (cmpname(v, name, namelen)) { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 124 | *code = v; |
| 125 | return 1; |
| 126 | } |
| 127 | incr = incr << 1; |
| 128 | if (incr > mask) |
Fredrik Lundh | 9e9bcda | 2001-01-21 17:01:31 +0000 | [diff] [blame] | 129 | incr = incr ^ code_poly; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 130 | } |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 131 | } |
| 132 | |
| 133 | static const _PyUnicode_Name_CAPI hashAPI = |
| 134 | { |
| 135 | sizeof(_PyUnicode_Name_CAPI), |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 136 | getname, |
| 137 | getcode |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 138 | }; |
| 139 | |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 140 | /* -------------------------------------------------------------------- */ |
| 141 | /* Python bindings */ |
| 142 | |
| 143 | static PyObject * |
| 144 | ucnhash_getname(PyObject* self, PyObject* args) |
| 145 | { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 146 | char name[NAME_MAXLEN]; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 147 | |
| 148 | int code; |
| 149 | if (!PyArg_ParseTuple(args, "i", &code)) |
| 150 | return NULL; |
| 151 | |
| 152 | if (!getname((Py_UCS4) code, name, sizeof(name))) { |
| 153 | PyErr_SetString(PyExc_ValueError, "undefined character code"); |
| 154 | return NULL; |
| 155 | } |
| 156 | |
| 157 | return Py_BuildValue("s", name); |
| 158 | } |
| 159 | |
| 160 | static PyObject * |
| 161 | ucnhash_getcode(PyObject* self, PyObject* args) |
| 162 | { |
| 163 | Py_UCS4 code; |
| 164 | |
| 165 | char* name; |
| 166 | int namelen; |
| 167 | if (!PyArg_ParseTuple(args, "s#", &name, &namelen)) |
| 168 | return NULL; |
| 169 | |
| 170 | if (!getcode(name, namelen, &code)) { |
| 171 | PyErr_SetString(PyExc_ValueError, "undefined character name"); |
| 172 | return NULL; |
| 173 | } |
| 174 | |
| 175 | return Py_BuildValue("i", code); |
| 176 | } |
| 177 | |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 178 | static |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 179 | PyMethodDef ucnhash_methods[] = |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 180 | { |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 181 | {"getname", ucnhash_getname, 1}, |
| 182 | {"getcode", ucnhash_getcode, 1}, |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 183 | {NULL, NULL}, |
| 184 | }; |
| 185 | |
/* Module doc-string, handed to Py_InitModule4() by the init function. */
static char ucnhash_docstring[] = "ucnhash hash function module";
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 187 | |
| 188 | |
| 189 | /* Create PyMethodObjects and register them in the module's dict */ |
| 190 | DL_EXPORT(void) |
| 191 | initucnhash(void) |
| 192 | { |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 193 | PyObject *m, *d, *v; |
| 194 | |
| 195 | m = Py_InitModule4( |
| 196 | "ucnhash", /* Module name */ |
| 197 | ucnhash_methods, /* Method list */ |
| 198 | ucnhash_docstring, /* Module doc-string */ |
| 199 | (PyObject *)NULL, /* always pass this as *self */ |
| 200 | PYTHON_API_VERSION); /* API Version */ |
| 201 | if (!m) |
| 202 | return; |
| 203 | |
| 204 | d = PyModule_GetDict(m); |
| 205 | if (!d) |
| 206 | return; |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 207 | |
| 208 | /* Export C API */ |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 209 | v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); |
| 210 | PyDict_SetItemString(d, "Unicode_Names_CAPI", v); |
| 211 | Py_XDECREF(v); |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 212 | } |