Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 1 | /* unicode character name tables */ |
| 2 | /* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */ |
| 3 | |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 4 | #include "Python.h" |
Guido van Rossum | 4f4b799 | 2000-06-29 00:06:39 +0000 | [diff] [blame] | 5 | #include "ucnhash.h" |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 6 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 7 | /* data file generated by Tools/unicode/makeunicodedata.py */ |
| 8 | #include "unicodename_db.h" |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 9 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 10 | /* -------------------------------------------------------------------- */ |
| 11 | /* database code (cut and pasted from the unidb package) */ |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 12 | |
/* Compute the case-insensitive hash of a character name.
 *
 *   s      name bytes (need not be NUL-terminated)
 *   len    number of bytes of s to hash
 *   scale  multiplier applied to the running hash before each byte is added
 *
 * Every byte is upper-cased with toupper() before it is mixed in, so two
 * names that differ only in letter case produce the same hash.  Whenever
 * bits 24..31 of the running value become set, they are XORed into the low
 * byte and the value is cut back to 24 bits (this keeps the result within
 * 24 bits provided the intermediate product fits in 32 bits).
 */
static unsigned long
gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* go through unsigned char: handing a negative char value to
           toupper() is undefined behaviour */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
| 27 | |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 28 | static int |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 29 | getname(Py_UCS4 code, char* buffer, int buflen) |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 30 | { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 31 | int offset; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 32 | int i; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 33 | int word; |
| 34 | unsigned char* w; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 35 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 36 | if (code < 0 || code >= 65536) |
| 37 | return 0; |
| 38 | |
| 39 | /* get offset into phrasebook */ |
Fredrik Lundh | 9e9bcda | 2001-01-21 17:01:31 +0000 | [diff] [blame^] | 40 | offset = phrasebook_offset1[(code>>phrasebook_shift)]; |
| 41 | offset = phrasebook_offset2[(offset<<phrasebook_shift)+ |
| 42 | (code&((1<<phrasebook_shift)-1))]; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 43 | if (!offset) |
| 44 | return 0; |
| 45 | |
| 46 | i = 0; |
| 47 | |
| 48 | for (;;) { |
| 49 | /* get word index */ |
| 50 | if (phrasebook[offset] & 128) { |
| 51 | word = phrasebook[offset] & 127; |
| 52 | offset++; |
| 53 | } else { |
| 54 | word = (phrasebook[offset]<<8) + phrasebook[offset+1]; |
| 55 | offset+=2; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 56 | } |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 57 | if (i) { |
| 58 | if (i > buflen) |
| 59 | return 0; /* buffer overflow */ |
| 60 | buffer[i++] = ' '; |
| 61 | } |
| 62 | /* copy word string from lexicon. the last character in the |
| 63 | word has bit 7 set. the last word in a string ends with |
| 64 | 0x80 */ |
| 65 | w = lexicon + lexicon_offset[word]; |
| 66 | while (*w < 128) { |
| 67 | if (i >= buflen) |
| 68 | return 0; /* buffer overflow */ |
| 69 | buffer[i++] = *w++; |
| 70 | } |
| 71 | if (i >= buflen) |
| 72 | return 0; /* buffer overflow */ |
| 73 | buffer[i++] = *w & 127; |
| 74 | if (*w == 128) |
| 75 | break; /* end of word */ |
| 76 | } |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 77 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 78 | return 1; |
| 79 | } |
| 80 | |
| 81 | static int |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 82 | cmpname(int code, const char* name, int namelen) |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 83 | { |
| 84 | /* check if code corresponds to the given name */ |
| 85 | int i; |
| 86 | char buffer[NAME_MAXLEN]; |
| 87 | if (!getname(code, buffer, sizeof(buffer))) |
| 88 | return 0; |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 89 | for (i = 0; i < namelen; i++) { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 90 | if (toupper(name[i]) != buffer[i]) |
| 91 | return 0; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 92 | } |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 93 | return buffer[namelen] == '\0'; |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 94 | } |
| 95 | |
| 96 | static int |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 97 | getcode(const char* name, int namelen, Py_UCS4* code) |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 98 | { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 99 | unsigned int h, v; |
Fredrik Lundh | 9e9bcda | 2001-01-21 17:01:31 +0000 | [diff] [blame^] | 100 | unsigned int mask = code_size-1; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 101 | unsigned int i, incr; |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 102 | |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 103 | /* the following is the same as python's dictionary lookup, with |
| 104 | only minor changes. see the makeunicodedata script for more |
| 105 | details */ |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 106 | |
Fredrik Lundh | 9e9bcda | 2001-01-21 17:01:31 +0000 | [diff] [blame^] | 107 | h = (unsigned int) gethash(name, namelen, code_magic); |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 108 | i = (~h) & mask; |
| 109 | v = code_hash[i]; |
| 110 | if (!v) |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 111 | return 0; |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 112 | if (cmpname(v, name, namelen)) { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 113 | *code = v; |
| 114 | return 1; |
| 115 | } |
| 116 | incr = (h ^ (h >> 3)) & mask; |
| 117 | if (!incr) |
| 118 | incr = mask; |
| 119 | for (;;) { |
| 120 | i = (i + incr) & mask; |
| 121 | v = code_hash[i]; |
| 122 | if (!v) |
| 123 | return -1; |
Fredrik Lundh | 7c1e4bb | 2001-01-19 19:45:02 +0000 | [diff] [blame] | 124 | if (cmpname(v, name, namelen)) { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 125 | *code = v; |
| 126 | return 1; |
| 127 | } |
| 128 | incr = incr << 1; |
| 129 | if (incr > mask) |
Fredrik Lundh | 9e9bcda | 2001-01-21 17:01:31 +0000 | [diff] [blame^] | 130 | incr = incr ^ code_poly; |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 131 | } |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 132 | } |
| 133 | |
| 134 | static const _PyUnicode_Name_CAPI hashAPI = |
| 135 | { |
| 136 | sizeof(_PyUnicode_Name_CAPI), |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 137 | getname, |
| 138 | getcode |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 139 | }; |
| 140 | |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 141 | /* -------------------------------------------------------------------- */ |
| 142 | /* Python bindings */ |
| 143 | |
| 144 | static PyObject * |
| 145 | ucnhash_getname(PyObject* self, PyObject* args) |
| 146 | { |
Fredrik Lundh | 95f1e6f | 2001-01-19 11:52:33 +0000 | [diff] [blame] | 147 | char name[NAME_MAXLEN]; |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 148 | |
| 149 | int code; |
| 150 | if (!PyArg_ParseTuple(args, "i", &code)) |
| 151 | return NULL; |
| 152 | |
| 153 | if (!getname((Py_UCS4) code, name, sizeof(name))) { |
| 154 | PyErr_SetString(PyExc_ValueError, "undefined character code"); |
| 155 | return NULL; |
| 156 | } |
| 157 | |
| 158 | return Py_BuildValue("s", name); |
| 159 | } |
| 160 | |
| 161 | static PyObject * |
| 162 | ucnhash_getcode(PyObject* self, PyObject* args) |
| 163 | { |
| 164 | Py_UCS4 code; |
| 165 | |
| 166 | char* name; |
| 167 | int namelen; |
| 168 | if (!PyArg_ParseTuple(args, "s#", &name, &namelen)) |
| 169 | return NULL; |
| 170 | |
| 171 | if (!getcode(name, namelen, &code)) { |
| 172 | PyErr_SetString(PyExc_ValueError, "undefined character name"); |
| 173 | return NULL; |
| 174 | } |
| 175 | |
| 176 | return Py_BuildValue("i", code); |
| 177 | } |
| 178 | |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 179 | static |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 180 | PyMethodDef ucnhash_methods[] = |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 181 | { |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 182 | {"getname", ucnhash_getname, 1}, |
| 183 | {"getcode", ucnhash_getcode, 1}, |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 184 | {NULL, NULL}, |
| 185 | }; |
| 186 | |
/* Doc string passed to Py_InitModule4() when the module is created. */
static char *ucnhash_docstring =
    "ucnhash hash function module";
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 188 | |
| 189 | |
| 190 | /* Create PyMethodObjects and register them in the module's dict */ |
| 191 | DL_EXPORT(void) |
| 192 | initucnhash(void) |
| 193 | { |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 194 | PyObject *m, *d, *v; |
| 195 | |
| 196 | m = Py_InitModule4( |
| 197 | "ucnhash", /* Module name */ |
| 198 | ucnhash_methods, /* Method list */ |
| 199 | ucnhash_docstring, /* Module doc-string */ |
| 200 | (PyObject *)NULL, /* always pass this as *self */ |
| 201 | PYTHON_API_VERSION); /* API Version */ |
| 202 | if (!m) |
| 203 | return; |
| 204 | |
| 205 | d = PyModule_GetDict(m); |
| 206 | if (!d) |
| 207 | return; |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 208 | |
| 209 | /* Export C API */ |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 210 | v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); |
| 211 | PyDict_SetItemString(d, "Unicode_Names_CAPI", v); |
| 212 | Py_XDECREF(v); |
Marc-André Lemburg | 8fb8748 | 2000-06-28 16:38:56 +0000 | [diff] [blame] | 213 | } |