blob: 248300e7dd9cf2d5ae10d002e09c1dec1dbba5a7 [file] [log] [blame]
/* unicode character name tables */
/* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
3
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00004#include "Python.h"
Guido van Rossum4f4b7992000-06-29 00:06:39 +00005#include "ucnhash.h"
Marc-André Lemburg8fb87482000-06-28 16:38:56 +00006
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +00007/* data file generated by Tools/unicode/makeunicodedata.py */
8#include "unicodename_db.h"
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012
/* Compute a case-insensitive hash of the first len bytes of s.

   s      bytes to hash (need not be NUL-terminated)
   len    number of bytes of s to hash
   scale  multiplier applied to the running hash before each byte is added

   Each byte is upper-cased before it is mixed in.  Whenever any of bits
   24..31 of the running value become set, they are XORed into the low
   byte and the value is cut back to 24 bits.  Returns the final running
   value (0 for an empty input). */
static unsigned long
gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        /* toupper() is only defined for EOF and values representable as
           unsigned char, so convert the (possibly negative) char first */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
27
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000028static int
Fredrik Lundhee865c62001-01-19 11:00:42 +000029getname(Py_UCS4 code, char* buffer, int buflen)
Marc-André Lemburg8fb87482000-06-28 16:38:56 +000030{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000031 int offset;
Fredrik Lundhee865c62001-01-19 11:00:42 +000032 int i;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000033 int word;
34 unsigned char* w;
Fredrik Lundhee865c62001-01-19 11:00:42 +000035
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000036 if (code < 0 || code >= 65536)
37 return 0;
38
39 /* get offset into phrasebook */
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000040 offset = phrasebook_offset1[(code>>phrasebook_shift)];
41 offset = phrasebook_offset2[(offset<<phrasebook_shift)+
42 (code&((1<<phrasebook_shift)-1))];
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000043 if (!offset)
44 return 0;
45
46 i = 0;
47
48 for (;;) {
49 /* get word index */
50 if (phrasebook[offset] & 128) {
51 word = phrasebook[offset] & 127;
52 offset++;
53 } else {
54 word = (phrasebook[offset]<<8) + phrasebook[offset+1];
55 offset+=2;
Fredrik Lundhee865c62001-01-19 11:00:42 +000056 }
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000057 if (i) {
58 if (i > buflen)
59 return 0; /* buffer overflow */
60 buffer[i++] = ' ';
61 }
62 /* copy word string from lexicon. the last character in the
63 word has bit 7 set. the last word in a string ends with
64 0x80 */
65 w = lexicon + lexicon_offset[word];
66 while (*w < 128) {
67 if (i >= buflen)
68 return 0; /* buffer overflow */
69 buffer[i++] = *w++;
70 }
71 if (i >= buflen)
72 return 0; /* buffer overflow */
73 buffer[i++] = *w & 127;
74 if (*w == 128)
75 break; /* end of word */
76 }
Fredrik Lundhee865c62001-01-19 11:00:42 +000077
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000078 return 1;
79}
80
81static int
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +000082cmpname(int code, const char* name, int namelen)
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000083{
84 /* check if code corresponds to the given name */
85 int i;
86 char buffer[NAME_MAXLEN];
87 if (!getname(code, buffer, sizeof(buffer)))
88 return 0;
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +000089 for (i = 0; i < namelen; i++) {
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000090 if (toupper(name[i]) != buffer[i])
91 return 0;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000092 }
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +000093 return buffer[namelen] == '\0';
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000094}
95
96static int
Fredrik Lundhee865c62001-01-19 11:00:42 +000097getcode(const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000098{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +000099 unsigned int h, v;
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000100 unsigned int mask = code_size-1;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000101 unsigned int i, incr;
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000102
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000103 /* the following is the same as python's dictionary lookup, with
104 only minor changes. see the makeunicodedata script for more
105 details */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000106
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000107 h = (unsigned int) gethash(name, namelen, code_magic);
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000108 i = (~h) & mask;
109 v = code_hash[i];
110 if (!v)
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000111 return 0;
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +0000112 if (cmpname(v, name, namelen)) {
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000113 *code = v;
114 return 1;
115 }
116 incr = (h ^ (h >> 3)) & mask;
117 if (!incr)
118 incr = mask;
119 for (;;) {
120 i = (i + incr) & mask;
121 v = code_hash[i];
122 if (!v)
123 return -1;
Fredrik Lundh7c1e4bb2001-01-19 19:45:02 +0000124 if (cmpname(v, name, namelen)) {
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000125 *code = v;
126 return 1;
127 }
128 incr = incr << 1;
129 if (incr > mask)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000130 incr = incr ^ code_poly;
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000131 }
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000132}
133
134static const _PyUnicode_Name_CAPI hashAPI =
135{
136 sizeof(_PyUnicode_Name_CAPI),
Fredrik Lundhee865c62001-01-19 11:00:42 +0000137 getname,
138 getcode
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000139};
140
/* -------------------------------------------------------------------- */
/* Python bindings */
143
144static PyObject *
145ucnhash_getname(PyObject* self, PyObject* args)
146{
Fredrik Lundh95f1e6f2001-01-19 11:52:33 +0000147 char name[NAME_MAXLEN];
Fredrik Lundhee865c62001-01-19 11:00:42 +0000148
149 int code;
150 if (!PyArg_ParseTuple(args, "i", &code))
151 return NULL;
152
153 if (!getname((Py_UCS4) code, name, sizeof(name))) {
154 PyErr_SetString(PyExc_ValueError, "undefined character code");
155 return NULL;
156 }
157
158 return Py_BuildValue("s", name);
159}
160
161static PyObject *
162ucnhash_getcode(PyObject* self, PyObject* args)
163{
164 Py_UCS4 code;
165
166 char* name;
167 int namelen;
168 if (!PyArg_ParseTuple(args, "s#", &name, &namelen))
169 return NULL;
170
171 if (!getcode(name, namelen, &code)) {
172 PyErr_SetString(PyExc_ValueError, "undefined character name");
173 return NULL;
174 }
175
176 return Py_BuildValue("i", code);
177}
178
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000179static
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000180PyMethodDef ucnhash_methods[] =
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000181{
Fredrik Lundhee865c62001-01-19 11:00:42 +0000182 {"getname", ucnhash_getname, 1},
183 {"getcode", ucnhash_getcode, 1},
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000184 {NULL, NULL},
185};
186
/* Doc-string passed to Py_InitModule4() when the module is created. */
static char *ucnhash_docstring =
    "ucnhash hash function module";
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000188
189
190/* Create PyMethodObjects and register them in the module's dict */
191DL_EXPORT(void)
192initucnhash(void)
193{
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000194 PyObject *m, *d, *v;
195
196 m = Py_InitModule4(
197 "ucnhash", /* Module name */
198 ucnhash_methods, /* Method list */
199 ucnhash_docstring, /* Module doc-string */
200 (PyObject *)NULL, /* always pass this as *self */
201 PYTHON_API_VERSION); /* API Version */
202 if (!m)
203 return;
204
205 d = PyModule_GetDict(m);
206 if (!d)
207 return;
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000208
209 /* Export C API */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +0000210 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
211 PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
212 Py_XDECREF(v);
Marc-André Lemburg8fb87482000-06-28 16:38:56 +0000213}