blob: d7b3f2ca1472e284fd184df6212319465c970ac6 [file] [log] [blame]
/* unicode character name tables */
/* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
3
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00004#include "Python.h"
Guido van Rossum4f4b7992000-06-29 00:06:39 +00005#include "ucnhash.h"
Marc-André Lemburg8fb87482000-06-28 16:38:56 +00006
/* data file generated by Tools/unicode/makeunicodedata.py */
8#include "unicodename_db.h"
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012
/* Compute a case-insensitive hash over the first len bytes of s.
 *
 * Every byte is upper-cased and mixed into a running value; whenever
 * the value spills into bits 24..31 those bits are folded back into
 * the low byte and the value is truncated to 24 bits.
 *
 * s    -- bytes to hash (need not be NUL-terminated)
 * len  -- number of bytes to read from s
 *
 * Returns the hash value.
 */
static unsigned long
gethash(const char *s, int len)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        /* magic value 47 was chosen to minimize the number
           of collisions for the uninames dataset.  see the
           makeunicodedata script for more background */
        /* convert to unsigned char *before* calling toupper: handing
           it a negative char value is undefined behaviour */
        h = (h * 47) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
30
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000031static int
Fredrik Lundhee865c62001-01-19 11:00:42 +000032getname(Py_UCS4 code, char* buffer, int buflen)
Marc-André Lemburg8fb87482000-06-28 16:38:56 +000033{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000034 int offset;
Fredrik Lundhee865c62001-01-19 11:00:42 +000035 int i;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000036 int word;
37 unsigned char* w;
Fredrik Lundhee865c62001-01-19 11:00:42 +000038
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000039 if (code < 0 || code >= 65536)
40 return 0;
41
42 /* get offset into phrasebook */
43 offset = phrasebook_offset1[(code>>SHIFT)];
44 offset = phrasebook_offset2[(offset<<SHIFT)+(code&((1<<SHIFT)-1))];
45 if (!offset)
46 return 0;
47
48 i = 0;
49
50 for (;;) {
51 /* get word index */
52 if (phrasebook[offset] & 128) {
53 word = phrasebook[offset] & 127;
54 offset++;
55 } else {
56 word = (phrasebook[offset]<<8) + phrasebook[offset+1];
57 offset+=2;
Fredrik Lundhee865c62001-01-19 11:00:42 +000058 }
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000059 if (i) {
60 if (i > buflen)
61 return 0; /* buffer overflow */
62 buffer[i++] = ' ';
63 }
64 /* copy word string from lexicon. the last character in the
65 word has bit 7 set. the last word in a string ends with
66 0x80 */
67 w = lexicon + lexicon_offset[word];
68 while (*w < 128) {
69 if (i >= buflen)
70 return 0; /* buffer overflow */
71 buffer[i++] = *w++;
72 }
73 if (i >= buflen)
74 return 0; /* buffer overflow */
75 buffer[i++] = *w & 127;
76 if (*w == 128)
77 break; /* end of word */
78 }
Fredrik Lundhee865c62001-01-19 11:00:42 +000079
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000080 return 1;
81}
82
83static int
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +000084cmpname(int code, const char* name, int namelen)
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000085{
86 /* check if code corresponds to the given name */
87 int i;
88 char buffer[NAME_MAXLEN];
89 if (!getname(code, buffer, sizeof(buffer)))
90 return 0;
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +000091 for (i = 0; i < namelen; i++) {
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000092 if (toupper(name[i]) != buffer[i])
93 return 0;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000094 }
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +000095 return buffer[namelen] == '\0';
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000096}
97
98static int
Fredrik Lundhee865c62001-01-19 11:00:42 +000099getcode(const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000100{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000101 unsigned int h, v;
102 unsigned int mask = CODE_SIZE-1;
103 unsigned int i, incr;
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000104
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000105 /* the following is the same as python's dictionary lookup, with
106 only minor changes. see the makeunicodedata script for more
107 details */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000108
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +0000109 h = (unsigned int) gethash(name, namelen);
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000110 i = (~h) & mask;
111 v = code_hash[i];
112 if (!v)
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000113 return 0;
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +0000114 if (cmpname(v, name, namelen)) {
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000115 *code = v;
116 return 1;
117 }
118 incr = (h ^ (h >> 3)) & mask;
119 if (!incr)
120 incr = mask;
121 for (;;) {
122 i = (i + incr) & mask;
123 v = code_hash[i];
124 if (!v)
125 return -1;
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +0000126 if (cmpname(v, name, namelen)) {
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000127 *code = v;
128 return 1;
129 }
130 incr = incr << 1;
131 if (incr > mask)
132 incr = incr ^ CODE_POLY;
133 }
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000134}
135
136static const _PyUnicode_Name_CAPI hashAPI =
137{
138 sizeof(_PyUnicode_Name_CAPI),
Fredrik Lundhee865c62001-01-19 11:00:42 +0000139 getname,
140 getcode
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000141};
142
/* -------------------------------------------------------------------- */
/* Python bindings */
145
146static PyObject *
147ucnhash_getname(PyObject* self, PyObject* args)
148{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000149 char name[NAME_MAXLEN];
Fredrik Lundhee865c62001-01-19 11:00:42 +0000150
151 int code;
152 if (!PyArg_ParseTuple(args, "i", &code))
153 return NULL;
154
155 if (!getname((Py_UCS4) code, name, sizeof(name))) {
156 PyErr_SetString(PyExc_ValueError, "undefined character code");
157 return NULL;
158 }
159
160 return Py_BuildValue("s", name);
161}
162
163static PyObject *
164ucnhash_getcode(PyObject* self, PyObject* args)
165{
166 Py_UCS4 code;
167
168 char* name;
169 int namelen;
170 if (!PyArg_ParseTuple(args, "s#", &name, &namelen))
171 return NULL;
172
173 if (!getcode(name, namelen, &code)) {
174 PyErr_SetString(PyExc_ValueError, "undefined character name");
175 return NULL;
176 }
177
178 return Py_BuildValue("i", code);
179}
180
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000181static
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000182PyMethodDef ucnhash_methods[] =
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000183{
Fredrik Lundhee865c62001-01-19 11:00:42 +0000184 {"getname", ucnhash_getname, 1},
185 {"getcode", ucnhash_getcode, 1},
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000186 {NULL, NULL},
187};
188
/* Doc-string passed to Py_InitModule4 when the module is created. */
static char *ucnhash_docstring =
    "ucnhash hash function module";
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000190
191
192/* Create PyMethodObjects and register them in the module's dict */
193DL_EXPORT(void)
194initucnhash(void)
195{
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000196 PyObject *m, *d, *v;
197
198 m = Py_InitModule4(
199 "ucnhash", /* Module name */
200 ucnhash_methods, /* Method list */
201 ucnhash_docstring, /* Module doc-string */
202 (PyObject *)NULL, /* always pass this as *self */
203 PYTHON_API_VERSION); /* API Version */
204 if (!m)
205 return;
206
207 d = PyModule_GetDict(m);
208 if (!d)
209 return;
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000210
211 /* Export C API */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000212 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
213 PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
214 Py_XDECREF(v);
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000215}