blob: bdcdab1aec54680707e5ef2feb631e8df3e05bbd [file] [log] [blame]
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +00001/* unicode character name tables */
2/* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
3
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00004#include "Python.h"
Guido van Rossum4f4b7992000-06-29 00:06:39 +00005#include "ucnhash.h"
Marc-André Lemburg8fb87482000-06-28 16:38:56 +00006
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +00007/* data file generated by Tools/unicode/makeunicodedata.py */
8#include "unicodename_db.h"
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000010/* -------------------------------------------------------------------- */
11/* database code (cut and pasted from the unidb package) */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012
/* Compute the hash of a character name.

   The name is hashed case-insensitively: every byte is folded to upper
   case before it is mixed in.  After each step, if any of bits 24..31
   are set, that byte is XOR-ed into the low byte and the value is cut
   back to 24 bits.

   s      name bytes (need not be NUL-terminated)
   len    number of bytes of s to hash
   scale  multiplier applied to the running hash before each byte is added

   NOTE(review): the generated tables are presumably built with the same
   function in makeunicodedata.py -- the two have to stay in sync. */
static unsigned long
gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        /* convert through unsigned char first: handing toupper() a
           negative plain char is undefined behaviour */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
27
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000028static int
Fredrik Lundhee865c62001-01-19 11:00:42 +000029getname(Py_UCS4 code, char* buffer, int buflen)
Marc-André Lemburg8fb87482000-06-28 16:38:56 +000030{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000031 int offset;
Fredrik Lundhee865c62001-01-19 11:00:42 +000032 int i;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000033 int word;
34 unsigned char* w;
Fredrik Lundhee865c62001-01-19 11:00:42 +000035
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000036 if (code < 0 || code >= 65536)
37 return 0;
38
39 /* get offset into phrasebook */
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000040 offset = phrasebook_offset1[(code>>phrasebook_shift)];
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000041 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000042 (code&((1<<phrasebook_shift)-1))];
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000043 if (!offset)
44 return 0;
45
46 i = 0;
47
48 for (;;) {
49 /* get word index */
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050 word = phrasebook[offset] - phrasebook_short;
51 if (word >= 0) {
52 word = (word << 8) + phrasebook[offset+1];
53 offset += 2;
54 } else
55 word = phrasebook[offset++];
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000056 if (i) {
57 if (i > buflen)
58 return 0; /* buffer overflow */
59 buffer[i++] = ' ';
60 }
61 /* copy word string from lexicon. the last character in the
62 word has bit 7 set. the last word in a string ends with
63 0x80 */
64 w = lexicon + lexicon_offset[word];
65 while (*w < 128) {
66 if (i >= buflen)
67 return 0; /* buffer overflow */
68 buffer[i++] = *w++;
69 }
70 if (i >= buflen)
71 return 0; /* buffer overflow */
72 buffer[i++] = *w & 127;
73 if (*w == 128)
74 break; /* end of word */
75 }
Fredrik Lundhee865c62001-01-19 11:00:42 +000076
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000077 return 1;
78}
79
80static int
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +000081cmpname(int code, const char* name, int namelen)
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000082{
83 /* check if code corresponds to the given name */
84 int i;
85 char buffer[NAME_MAXLEN];
86 if (!getname(code, buffer, sizeof(buffer)))
87 return 0;
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +000088 for (i = 0; i < namelen; i++) {
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000089 if (toupper(name[i]) != buffer[i])
90 return 0;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000091 }
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +000092 return buffer[namelen] == '\0';
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000093}
94
95static int
Fredrik Lundhee865c62001-01-19 11:00:42 +000096getcode(const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000097{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000098 unsigned int h, v;
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000099 unsigned int mask = code_size-1;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000100 unsigned int i, incr;
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000101
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000102 /* the following is the same as python's dictionary lookup, with
103 only minor changes. see the makeunicodedata script for more
104 details */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000105
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000106 h = (unsigned int) gethash(name, namelen, code_magic);
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000107 i = (~h) & mask;
108 v = code_hash[i];
109 if (!v)
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000110 return 0;
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +0000111 if (cmpname(v, name, namelen)) {
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000112 *code = v;
113 return 1;
114 }
115 incr = (h ^ (h >> 3)) & mask;
116 if (!incr)
117 incr = mask;
118 for (;;) {
119 i = (i + incr) & mask;
120 v = code_hash[i];
121 if (!v)
122 return -1;
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +0000123 if (cmpname(v, name, namelen)) {
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000124 *code = v;
125 return 1;
126 }
127 incr = incr << 1;
128 if (incr > mask)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000129 incr = incr ^ code_poly;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000130 }
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000131}
132
133static const _PyUnicode_Name_CAPI hashAPI =
134{
135 sizeof(_PyUnicode_Name_CAPI),
Fredrik Lundhee865c62001-01-19 11:00:42 +0000136 getname,
137 getcode
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000138};
139
Fredrik Lundhee865c62001-01-19 11:00:42 +0000140/* -------------------------------------------------------------------- */
141/* Python bindings */
142
143static PyObject *
144ucnhash_getname(PyObject* self, PyObject* args)
145{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000146 char name[NAME_MAXLEN];
Fredrik Lundhee865c62001-01-19 11:00:42 +0000147
148 int code;
149 if (!PyArg_ParseTuple(args, "i", &code))
150 return NULL;
151
152 if (!getname((Py_UCS4) code, name, sizeof(name))) {
153 PyErr_SetString(PyExc_ValueError, "undefined character code");
154 return NULL;
155 }
156
157 return Py_BuildValue("s", name);
158}
159
160static PyObject *
161ucnhash_getcode(PyObject* self, PyObject* args)
162{
163 Py_UCS4 code;
164
165 char* name;
166 int namelen;
167 if (!PyArg_ParseTuple(args, "s#", &name, &namelen))
168 return NULL;
169
170 if (!getcode(name, namelen, &code)) {
171 PyErr_SetString(PyExc_ValueError, "undefined character name");
172 return NULL;
173 }
174
175 return Py_BuildValue("i", code);
176}
177
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000178static
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000179PyMethodDef ucnhash_methods[] =
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000180{
Fredrik Lundhee865c62001-01-19 11:00:42 +0000181 {"getname", ucnhash_getname, 1},
182 {"getcode", ucnhash_getcode, 1},
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000183 {NULL, NULL},
184};
185
/* Doc-string passed to Py_InitModule4() when the module is created. */
static char *ucnhash_docstring =
    "ucnhash hash function module";
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000187
188
189/* Create PyMethodObjects and register them in the module's dict */
190DL_EXPORT(void)
191initucnhash(void)
192{
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000193 PyObject *m, *d, *v;
194
195 m = Py_InitModule4(
196 "ucnhash", /* Module name */
197 ucnhash_methods, /* Method list */
198 ucnhash_docstring, /* Module doc-string */
199 (PyObject *)NULL, /* always pass this as *self */
200 PYTHON_API_VERSION); /* API Version */
201 if (!m)
202 return;
203
204 d = PyModule_GetDict(m);
205 if (!d)
206 return;
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000207
208 /* Export C API */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000209 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
210 PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
211 Py_XDECREF(v);
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000212}