blob: cc2a38ea8ca181fe4ac807ff8994a155c3ffbea8 [file] [log] [blame]
/* unicode character name tables */
/* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
3
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00004#include "Python.h"
Guido van Rossum4f4b7992000-06-29 00:06:39 +00005#include "ucnhash.h"
Marc-André Lemburg8fb87482000-06-28 16:38:56 +00006
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +00007/* data file generated by Tools/unicode/makeunicodedata.py */
8#include "unicodename_db.h"
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012
/* Compute a case-insensitive hash of the NUL-terminated string s.

   Each character is upper-cased before being mixed in, so "a" and "A"
   hash identically.  After every step any bits above bit 23 are folded
   back into the low byte and masked off, so the returned value always
   fits in 24 bits.  An empty string hashes to 0. */
static unsigned long
gethash(const char *s)
{
    unsigned long h = 0;
    unsigned long top;

    while (*s) {
        /* magic value 47 was chosen to minimize the number
           of collisions for the uninames dataset.  see the
           makeunicodedata script for more background */

        /* the argument is converted to unsigned char first: handing a
           negative char value to toupper() is undefined behaviour */
        h = (h * 47) + (unsigned char) toupper((unsigned char) *s++);

        /* fold the top byte back into the low bits, keep 24 bits */
        top = h & 0xff000000;
        if (top)
            h = (h ^ ((top >> 24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
29
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000030static int
Fredrik Lundhee865c62001-01-19 11:00:42 +000031getname(Py_UCS4 code, char* buffer, int buflen)
Marc-André Lemburg8fb87482000-06-28 16:38:56 +000032{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000033 int offset;
Fredrik Lundhee865c62001-01-19 11:00:42 +000034 int i;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000035 int word;
36 unsigned char* w;
Fredrik Lundhee865c62001-01-19 11:00:42 +000037
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000038 if (code < 0 || code >= 65536)
39 return 0;
40
41 /* get offset into phrasebook */
42 offset = phrasebook_offset1[(code>>SHIFT)];
43 offset = phrasebook_offset2[(offset<<SHIFT)+(code&((1<<SHIFT)-1))];
44 if (!offset)
45 return 0;
46
47 i = 0;
48
49 for (;;) {
50 /* get word index */
51 if (phrasebook[offset] & 128) {
52 word = phrasebook[offset] & 127;
53 offset++;
54 } else {
55 word = (phrasebook[offset]<<8) + phrasebook[offset+1];
56 offset+=2;
Fredrik Lundhee865c62001-01-19 11:00:42 +000057 }
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000058 if (i) {
59 if (i > buflen)
60 return 0; /* buffer overflow */
61 buffer[i++] = ' ';
62 }
63 /* copy word string from lexicon. the last character in the
64 word has bit 7 set. the last word in a string ends with
65 0x80 */
66 w = lexicon + lexicon_offset[word];
67 while (*w < 128) {
68 if (i >= buflen)
69 return 0; /* buffer overflow */
70 buffer[i++] = *w++;
71 }
72 if (i >= buflen)
73 return 0; /* buffer overflow */
74 buffer[i++] = *w & 127;
75 if (*w == 128)
76 break; /* end of word */
77 }
Fredrik Lundhee865c62001-01-19 11:00:42 +000078
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000079 return 1;
80}
81
82static int
83cmpname(int code, const char* name)
84{
85 /* check if code corresponds to the given name */
86 int i;
87 char buffer[NAME_MAXLEN];
88 if (!getname(code, buffer, sizeof(buffer)))
89 return 0;
90 i = 0;
91 for (;;) {
92 if (toupper(name[i]) != buffer[i])
93 return 0;
94 if (!name[i] || !buffer[i])
95 return 1;
96 i++;
97 }
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000098}
99
100static int
Fredrik Lundhee865c62001-01-19 11:00:42 +0000101getcode(const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000102{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000103 unsigned int h, v;
104 unsigned int mask = CODE_SIZE-1;
105 unsigned int i, incr;
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000106
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000107 /* the following is the same as python's dictionary lookup, with
108 only minor changes. see the makeunicodedata script for more
109 details */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000110
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000111 h = (unsigned int) gethash(name);
112 i = (~h) & mask;
113 v = code_hash[i];
114 if (!v)
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000115 return 0;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000116 if (cmpname(v, name)) {
117 *code = v;
118 return 1;
119 }
120 incr = (h ^ (h >> 3)) & mask;
121 if (!incr)
122 incr = mask;
123 for (;;) {
124 i = (i + incr) & mask;
125 v = code_hash[i];
126 if (!v)
127 return -1;
128 if (cmpname(v, name)) {
129 *code = v;
130 return 1;
131 }
132 incr = incr << 1;
133 if (incr > mask)
134 incr = incr ^ CODE_POLY;
135 }
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000136}
137
138static const _PyUnicode_Name_CAPI hashAPI =
139{
140 sizeof(_PyUnicode_Name_CAPI),
Fredrik Lundhee865c62001-01-19 11:00:42 +0000141 getname,
142 getcode
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000143};
144
/* -------------------------------------------------------------------- */
/* Python bindings */
147
148static PyObject *
149ucnhash_getname(PyObject* self, PyObject* args)
150{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000151 char name[NAME_MAXLEN];
Fredrik Lundhee865c62001-01-19 11:00:42 +0000152
153 int code;
154 if (!PyArg_ParseTuple(args, "i", &code))
155 return NULL;
156
157 if (!getname((Py_UCS4) code, name, sizeof(name))) {
158 PyErr_SetString(PyExc_ValueError, "undefined character code");
159 return NULL;
160 }
161
162 return Py_BuildValue("s", name);
163}
164
165static PyObject *
166ucnhash_getcode(PyObject* self, PyObject* args)
167{
168 Py_UCS4 code;
169
170 char* name;
171 int namelen;
172 if (!PyArg_ParseTuple(args, "s#", &name, &namelen))
173 return NULL;
174
175 if (!getcode(name, namelen, &code)) {
176 PyErr_SetString(PyExc_ValueError, "undefined character name");
177 return NULL;
178 }
179
180 return Py_BuildValue("i", code);
181}
182
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000183static
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000184PyMethodDef ucnhash_methods[] =
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000185{
Fredrik Lundhee865c62001-01-19 11:00:42 +0000186 {"getname", ucnhash_getname, 1},
187 {"getcode", ucnhash_getcode, 1},
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000188 {NULL, NULL},
189};
190
/* Doc-string passed to Py_InitModule4() when the module is created. */
static char *ucnhash_docstring =
    "ucnhash hash function module";
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000192
193
194/* Create PyMethodObjects and register them in the module's dict */
195DL_EXPORT(void)
196initucnhash(void)
197{
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000198 PyObject *m, *d, *v;
199
200 m = Py_InitModule4(
201 "ucnhash", /* Module name */
202 ucnhash_methods, /* Method list */
203 ucnhash_docstring, /* Module doc-string */
204 (PyObject *)NULL, /* always pass this as *self */
205 PYTHON_API_VERSION); /* API Version */
206 if (!m)
207 return;
208
209 d = PyModule_GetDict(m);
210 if (!d)
211 return;
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000212
213 /* Export C API */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000214 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
215 PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
216 Py_XDECREF(v);
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000217}