blob: 4711123eb2e1f4ea90c07f46d3b78908f0d5ca19 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.0 database.

   Data was extracted from the Unicode 3.0 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
13
14#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000015#include "ucnhash.h"
16
17/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018
/* One entry of the character property table: four single-byte fields. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
27
28/* data file generated by Tools/unicode/makeunicodedata.py */
29#include "unicodedata_db.h"
30
31static const _PyUnicode_DatabaseRecord*
Fredrik Lundhb95896b2001-02-18 22:06:17 +000032_getrecord(PyUnicodeObject* v)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000033{
34 int code;
35 int index;
36
37 code = (int) *PyUnicode_AS_UNICODE(v);
38
39 if (code < 0 || code >= 65536)
40 index = 0;
41 else {
42 index = index1[(code>>SHIFT)];
43 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
44 }
45
46 return &_PyUnicode_Database_Records[index];
47}
48
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000049/* --- Module API --------------------------------------------------------- */
50
51static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000052unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000053{
54 PyUnicodeObject *v;
55 PyObject *defobj = NULL;
56 long rc;
57
Fredrik Lundh06d12682001-01-24 07:59:11 +000058 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000060 if (PyUnicode_GET_SIZE(v) != 1) {
61 PyErr_SetString(PyExc_TypeError,
62 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000063 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000064 }
65 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
66 if (rc < 0) {
67 if (defobj == NULL) {
68 PyErr_SetString(PyExc_ValueError,
69 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000070 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000071 }
72 else {
73 Py_INCREF(defobj);
74 return defobj;
75 }
76 }
77 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000078}
79
80static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000081unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000082{
83 PyUnicodeObject *v;
84 PyObject *defobj = NULL;
85 long rc;
86
Fredrik Lundh06d12682001-01-24 07:59:11 +000087 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000088 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000089 if (PyUnicode_GET_SIZE(v) != 1) {
90 PyErr_SetString(PyExc_TypeError,
91 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000092 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000093 }
94 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
95 if (rc < 0) {
96 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +000097 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000098 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000099 }
100 else {
101 Py_INCREF(defobj);
102 return defobj;
103 }
104 }
105 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000106}
107
108static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000109unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000110{
111 PyUnicodeObject *v;
112 PyObject *defobj = NULL;
113 double rc;
114
Fredrik Lundh06d12682001-01-24 07:59:11 +0000115 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000116 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000117 if (PyUnicode_GET_SIZE(v) != 1) {
118 PyErr_SetString(PyExc_TypeError,
119 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000120 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000121 }
122 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
123 if (rc < 0) {
124 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000125 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000126 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000127 }
128 else {
129 Py_INCREF(defobj);
130 return defobj;
131 }
132 }
133 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000134}
135
136static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000137unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000138{
139 PyUnicodeObject *v;
140 int index;
141
142 if (!PyArg_ParseTuple(args, "O!:category",
143 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000144 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000145 if (PyUnicode_GET_SIZE(v) != 1) {
146 PyErr_SetString(PyExc_TypeError,
147 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000148 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000149 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000150 index = (int) _getrecord(v)->category;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000151 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000152}
153
154static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000155unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156{
157 PyUnicodeObject *v;
158 int index;
159
160 if (!PyArg_ParseTuple(args, "O!:bidirectional",
161 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000162 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000163 if (PyUnicode_GET_SIZE(v) != 1) {
164 PyErr_SetString(PyExc_TypeError,
165 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000166 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000167 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000168 index = (int) _getrecord(v)->bidirectional;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170}
171
172static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000173unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174{
175 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000176
177 if (!PyArg_ParseTuple(args, "O!:combining",
178 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000179 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000180 if (PyUnicode_GET_SIZE(v) != 1) {
181 PyErr_SetString(PyExc_TypeError,
182 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000183 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000184 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000185 return PyInt_FromLong((int) _getrecord(v)->combining);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000186}
187
188static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000189unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190{
191 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192
193 if (!PyArg_ParseTuple(args, "O!:mirrored",
194 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000195 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000196 if (PyUnicode_GET_SIZE(v) != 1) {
197 PyErr_SetString(PyExc_TypeError,
198 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000199 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000200 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000201 return PyInt_FromLong((int) _getrecord(v)->mirrored);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202}
203
204static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000205unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000206{
207 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208 char decomp[256];
209 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210
211 if (!PyArg_ParseTuple(args, "O!:decomposition",
212 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000213 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000214 if (PyUnicode_GET_SIZE(v) != 1) {
215 PyErr_SetString(PyExc_TypeError,
216 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000217 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000218 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000219
220 code = (int) *PyUnicode_AS_UNICODE(v);
221
222 if (code < 0 || code >= 65536)
223 index = 0;
224 else {
225 index = decomp_index1[(code>>DECOMP_SHIFT)];
226 index = decomp_index2[(index<<DECOMP_SHIFT)+
227 (code&((1<<DECOMP_SHIFT)-1))];
228 }
229
230 /* high byte is of hex bytes (usually one or two), low byte
231 is prefix code (from*/
232 count = decomp_data[index] >> 8;
233
234 /* XXX: could allocate the PyString up front instead
235 (strlen(prefix) + 5 * count + 1 bytes) */
236
237 /* copy prefix */
238 i = strlen(decomp_prefix[decomp_data[index] & 255]);
239 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
240
241 while (count-- > 0) {
242 if (i)
243 decomp[i++] = ' ';
244 sprintf(decomp + i, "%04X", decomp_data[++index]);
245 i += strlen(decomp + i);
246 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000247
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000248 decomp[i] = '\0';
249
250 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000251}
252
Fredrik Lundh06d12682001-01-24 07:59:11 +0000253/* -------------------------------------------------------------------- */
254/* unicode character name tables */
255
256/* data file generated by Tools/unicode/makeunicodedata.py */
257#include "unicodename_db.h"
258
259/* -------------------------------------------------------------------- */
260/* database code (cut and pasted from the unidb package) */
261
/* Compute the case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased and mixed in as h = h * scale + byte; whenever
   the result spills into bits 24-31, those bits are folded back into the
   low byte so the returned value always fits in 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() is only defined for EOF and unsigned char values, so
           convert before the call: plain char may be negative */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
276
277static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000278_getname(Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000279{
280 int offset;
281 int i;
282 int word;
283 unsigned char* w;
284
Fred Drake6a16ea02001-07-19 21:11:13 +0000285 if (code >= 65536)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000286 return 0;
287
288 /* get offset into phrasebook */
289 offset = phrasebook_offset1[(code>>phrasebook_shift)];
290 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
291 (code&((1<<phrasebook_shift)-1))];
292 if (!offset)
293 return 0;
294
295 i = 0;
296
297 for (;;) {
298 /* get word index */
299 word = phrasebook[offset] - phrasebook_short;
300 if (word >= 0) {
301 word = (word << 8) + phrasebook[offset+1];
302 offset += 2;
303 } else
304 word = phrasebook[offset++];
305 if (i) {
306 if (i > buflen)
307 return 0; /* buffer overflow */
308 buffer[i++] = ' ';
309 }
310 /* copy word string from lexicon. the last character in the
311 word has bit 7 set. the last word in a string ends with
312 0x80 */
313 w = lexicon + lexicon_offset[word];
314 while (*w < 128) {
315 if (i >= buflen)
316 return 0; /* buffer overflow */
317 buffer[i++] = *w++;
318 }
319 if (i >= buflen)
320 return 0; /* buffer overflow */
321 buffer[i++] = *w & 127;
322 if (*w == 128)
323 break; /* end of word */
324 }
325
326 return 1;
327}
328
329static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000330_cmpname(int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000331{
332 /* check if code corresponds to the given name */
333 int i;
334 char buffer[NAME_MAXLEN];
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000335 if (!_getname(code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000336 return 0;
337 for (i = 0; i < namelen; i++) {
338 if (toupper(name[i]) != buffer[i])
339 return 0;
340 }
341 return buffer[namelen] == '\0';
342}
343
344static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000345_getcode(const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000346{
347 unsigned int h, v;
348 unsigned int mask = code_size-1;
349 unsigned int i, incr;
350
351 /* the following is the same as python's dictionary lookup, with
352 only minor changes. see the makeunicodedata script for more
353 details */
354
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000355 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000356 i = (~h) & mask;
357 v = code_hash[i];
358 if (!v)
359 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000360 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000361 *code = v;
362 return 1;
363 }
364 incr = (h ^ (h >> 3)) & mask;
365 if (!incr)
366 incr = mask;
367 for (;;) {
368 i = (i + incr) & mask;
369 v = code_hash[i];
370 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +0000371 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000372 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000373 *code = v;
374 return 1;
375 }
376 incr = incr << 1;
377 if (incr > mask)
378 incr = incr ^ code_poly;
379 }
380}
381
382static const _PyUnicode_Name_CAPI hashAPI =
383{
384 sizeof(_PyUnicode_Name_CAPI),
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000385 _getname,
386 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +0000387};
388
389/* -------------------------------------------------------------------- */
390/* Python bindings */
391
392static PyObject *
393unicodedata_name(PyObject* self, PyObject* args)
394{
395 char name[NAME_MAXLEN];
396
397 PyUnicodeObject* v;
398 PyObject* defobj = NULL;
399 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
400 return NULL;
401
402 if (PyUnicode_GET_SIZE(v) != 1) {
403 PyErr_SetString(PyExc_TypeError,
404 "need a single Unicode character as parameter");
405 return NULL;
406 }
407
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000408 if (!_getname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
409 name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000410 if (defobj == NULL) {
411 PyErr_SetString(PyExc_ValueError, "no such name");
412 return NULL;
413 }
414 else {
415 Py_INCREF(defobj);
416 return defobj;
417 }
418 }
419
420 return Py_BuildValue("s", name);
421}
422
423static PyObject *
424unicodedata_lookup(PyObject* self, PyObject* args)
425{
426 Py_UCS4 code;
427 Py_UNICODE str[1];
428
429 char* name;
430 int namelen;
431 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
432 return NULL;
433
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000434 if (!_getcode(name, namelen, &code)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000435 PyErr_SetString(PyExc_KeyError, "undefined character name");
436 return NULL;
437 }
438
439 str[0] = (Py_UNICODE) code;
440 return PyUnicode_FromUnicode(str, 1);
441}
442
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000443/* XXX Add doc strings. */
444
445static PyMethodDef unicodedata_functions[] = {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000446 {"decimal", unicodedata_decimal, METH_VARARGS},
447 {"digit", unicodedata_digit, METH_VARARGS},
448 {"numeric", unicodedata_numeric, METH_VARARGS},
449 {"category", unicodedata_category, METH_VARARGS},
450 {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
451 {"combining", unicodedata_combining, METH_VARARGS},
452 {"mirrored", unicodedata_mirrored, METH_VARARGS},
453 {"decomposition",unicodedata_decomposition, METH_VARARGS},
454 {"name", unicodedata_name, METH_VARARGS},
455 {"lookup", unicodedata_lookup, METH_VARARGS},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000456 {NULL, NULL} /* sentinel */
457};
458
/* module docstring passed to Py_InitModule3() */
static char *unicodedata_docstring =
    "unicode character database";
460
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000461DL_EXPORT(void)
Thomas Woutersf3f33dc2000-07-21 06:00:07 +0000462initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000463{
Fredrik Lundh06d12682001-01-24 07:59:11 +0000464 PyObject *m, *d, *v;
465
Fred Drakef585bef2001-03-03 19:41:55 +0000466 m = Py_InitModule3(
467 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000468 if (!m)
469 return;
470
471 d = PyModule_GetDict(m);
472 if (!d)
473 return;
474
475 /* Export C API */
476 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakef585bef2001-03-03 19:41:55 +0000477 if (v != NULL) {
478 PyDict_SetItemString(d, "ucnhash_CAPI", v);
479 Py_DECREF(v);
480 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000481}