| /* unicode character name tables */ |
| /* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */ |
| |
| #include "Python.h" |
| #include "ucnhash.h" |
| |
| /* data file generated by Tools/unicode/makeunicodedata.py */ |
| #include "unicodename_db.h" |
| |
| /* -------------------------------------------------------------------- */ |
| /* database code (cut and pasted from the unidb package) */ |
| |
/* Compute a case-insensitive hash over the first `len` bytes of `s`.
 *
 * Every byte is upper-cased and folded into the running value as
 * h = h * scale + byte.  Whenever any of bits 24..31 become set, that
 * byte is XORed into the low byte and the value is cut back to 24 bits.
 *
 * Returns the final hash; an empty input (len <= 0) hashes to 0.
 */
static unsigned long
gethash(const char *s, int len, int scale)
{
    unsigned long h = 0;
    int i;

    for (i = 0; i < len; i++) {
        unsigned long ix;

        /* the <ctype.h> functions require a value representable as
           unsigned char (or EOF); handing them a negative plain char
           is undefined behaviour, so convert before the call */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);

        /* fold the overflow byte back in and keep 24 bits */
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
| |
| static int |
| getname(Py_UCS4 code, char* buffer, int buflen) |
| { |
| int offset; |
| int i; |
| int word; |
| unsigned char* w; |
| |
| if (code < 0 || code >= 65536) |
| return 0; |
| |
| /* get offset into phrasebook */ |
| offset = phrasebook_offset1[(code>>phrasebook_shift)]; |
| offset = phrasebook_offset2[(offset<<phrasebook_shift) + |
| (code&((1<<phrasebook_shift)-1))]; |
| if (!offset) |
| return 0; |
| |
| i = 0; |
| |
| for (;;) { |
| /* get word index */ |
| word = phrasebook[offset] - phrasebook_short; |
| if (word >= 0) { |
| word = (word << 8) + phrasebook[offset+1]; |
| offset += 2; |
| } else |
| word = phrasebook[offset++]; |
| if (i) { |
| if (i > buflen) |
| return 0; /* buffer overflow */ |
| buffer[i++] = ' '; |
| } |
| /* copy word string from lexicon. the last character in the |
| word has bit 7 set. the last word in a string ends with |
| 0x80 */ |
| w = lexicon + lexicon_offset[word]; |
| while (*w < 128) { |
| if (i >= buflen) |
| return 0; /* buffer overflow */ |
| buffer[i++] = *w++; |
| } |
| if (i >= buflen) |
| return 0; /* buffer overflow */ |
| buffer[i++] = *w & 127; |
| if (*w == 128) |
| break; /* end of word */ |
| } |
| |
| return 1; |
| } |
| |
| static int |
| cmpname(int code, const char* name, int namelen) |
| { |
| /* check if code corresponds to the given name */ |
| int i; |
| char buffer[NAME_MAXLEN]; |
| if (!getname(code, buffer, sizeof(buffer))) |
| return 0; |
| for (i = 0; i < namelen; i++) { |
| if (toupper(name[i]) != buffer[i]) |
| return 0; |
| } |
| return buffer[namelen] == '\0'; |
| } |
| |
| static int |
| getcode(const char* name, int namelen, Py_UCS4* code) |
| { |
| unsigned int h, v; |
| unsigned int mask = code_size-1; |
| unsigned int i, incr; |
| |
| /* the following is the same as python's dictionary lookup, with |
| only minor changes. see the makeunicodedata script for more |
| details */ |
| |
| h = (unsigned int) gethash(name, namelen, code_magic); |
| i = (~h) & mask; |
| v = code_hash[i]; |
| if (!v) |
| return 0; |
| if (cmpname(v, name, namelen)) { |
| *code = v; |
| return 1; |
| } |
| incr = (h ^ (h >> 3)) & mask; |
| if (!incr) |
| incr = mask; |
| for (;;) { |
| i = (i + incr) & mask; |
| v = code_hash[i]; |
| if (!v) |
| return -1; |
| if (cmpname(v, name, namelen)) { |
| *code = v; |
| return 1; |
| } |
| incr = incr << 1; |
| if (incr > mask) |
| incr = incr ^ code_poly; |
| } |
| } |
| |
| static const _PyUnicode_Name_CAPI hashAPI = |
| { |
| sizeof(_PyUnicode_Name_CAPI), |
| getname, |
| getcode |
| }; |
| |
| /* -------------------------------------------------------------------- */ |
| /* Python bindings */ |
| |
| static PyObject * |
| ucnhash_getname(PyObject* self, PyObject* args) |
| { |
| char name[NAME_MAXLEN]; |
| |
| int code; |
| if (!PyArg_ParseTuple(args, "i", &code)) |
| return NULL; |
| |
| if (!getname((Py_UCS4) code, name, sizeof(name))) { |
| PyErr_SetString(PyExc_ValueError, "undefined character code"); |
| return NULL; |
| } |
| |
| return Py_BuildValue("s", name); |
| } |
| |
| static PyObject * |
| ucnhash_getcode(PyObject* self, PyObject* args) |
| { |
| Py_UCS4 code; |
| |
| char* name; |
| int namelen; |
| if (!PyArg_ParseTuple(args, "s#", &name, &namelen)) |
| return NULL; |
| |
| if (!getcode(name, namelen, &code)) { |
| PyErr_SetString(PyExc_ValueError, "undefined character name"); |
| return NULL; |
| } |
| |
| return Py_BuildValue("i", code); |
| } |
| |
| static |
| PyMethodDef ucnhash_methods[] = |
| { |
| {"getname", ucnhash_getname, 1}, |
| {"getcode", ucnhash_getcode, 1}, |
| {NULL, NULL}, |
| }; |
| |
/* Module doc-string passed to Py_InitModule4() */
static char *ucnhash_docstring =
    "ucnhash hash function module";
| |
| |
| /* Create PyMethodObjects and register them in the module's dict */ |
| DL_EXPORT(void) |
| initucnhash(void) |
| { |
| PyObject *m, *d, *v; |
| |
| m = Py_InitModule4( |
| "ucnhash", /* Module name */ |
| ucnhash_methods, /* Method list */ |
| ucnhash_docstring, /* Module doc-string */ |
| (PyObject *)NULL, /* always pass this as *self */ |
| PYTHON_API_VERSION); /* API Version */ |
| if (!m) |
| return; |
| |
| d = PyModule_GetDict(m); |
| if (!d) |
| return; |
| |
| /* Export C API */ |
| v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); |
| PyDict_SetItemString(d, "Unicode_Names_CAPI", v); |
| Py_XDECREF(v); |
| } |