blob: 3620936d9fbc3598b58ac075959f1b34c509a300 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.2 data base.

   Data was extracted from the Unicode 3.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
17
18/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000019
/* One row of the per-character property table; every field is a single
   byte. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
28
29/* data file generated by Tools/unicode/makeunicodedata.py */
30#include "unicodedata_db.h"
31
32static const _PyUnicode_DatabaseRecord*
Fredrik Lundhb95896b2001-02-18 22:06:17 +000033_getrecord(PyUnicodeObject* v)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000034{
35 int code;
36 int index;
37
38 code = (int) *PyUnicode_AS_UNICODE(v);
39
Martin v. Löwis9def6a32002-10-18 16:11:54 +000040 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000041 index = 0;
42 else {
43 index = index1[(code>>SHIFT)];
44 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
45 }
46
47 return &_PyUnicode_Database_Records[index];
48}
49
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000050/* --- Module API --------------------------------------------------------- */
51
52static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000053unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000054{
55 PyUnicodeObject *v;
56 PyObject *defobj = NULL;
57 long rc;
58
Fredrik Lundh06d12682001-01-24 07:59:11 +000059 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000060 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000061 if (PyUnicode_GET_SIZE(v) != 1) {
62 PyErr_SetString(PyExc_TypeError,
63 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000065 }
66 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
67 if (rc < 0) {
68 if (defobj == NULL) {
69 PyErr_SetString(PyExc_ValueError,
70 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000071 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000072 }
73 else {
74 Py_INCREF(defobj);
75 return defobj;
76 }
77 }
78 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000079}
80
81static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000082unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000083{
84 PyUnicodeObject *v;
85 PyObject *defobj = NULL;
86 long rc;
87
Fredrik Lundh06d12682001-01-24 07:59:11 +000088 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000089 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000090 if (PyUnicode_GET_SIZE(v) != 1) {
91 PyErr_SetString(PyExc_TypeError,
92 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000093 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000094 }
95 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
96 if (rc < 0) {
97 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +000098 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000099 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000100 }
101 else {
102 Py_INCREF(defobj);
103 return defobj;
104 }
105 }
106 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000107}
108
109static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000110unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000111{
112 PyUnicodeObject *v;
113 PyObject *defobj = NULL;
114 double rc;
115
Fredrik Lundh06d12682001-01-24 07:59:11 +0000116 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118 if (PyUnicode_GET_SIZE(v) != 1) {
119 PyErr_SetString(PyExc_TypeError,
120 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000121 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000122 }
123 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
124 if (rc < 0) {
125 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000126 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000127 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000128 }
129 else {
130 Py_INCREF(defobj);
131 return defobj;
132 }
133 }
134 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000135}
136
137static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000138unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000139{
140 PyUnicodeObject *v;
141 int index;
142
143 if (!PyArg_ParseTuple(args, "O!:category",
144 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000145 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000146 if (PyUnicode_GET_SIZE(v) != 1) {
147 PyErr_SetString(PyExc_TypeError,
148 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000149 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000150 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000151 index = (int) _getrecord(v)->category;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000152 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000153}
154
155static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000156unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157{
158 PyUnicodeObject *v;
159 int index;
160
161 if (!PyArg_ParseTuple(args, "O!:bidirectional",
162 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000163 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000164 if (PyUnicode_GET_SIZE(v) != 1) {
165 PyErr_SetString(PyExc_TypeError,
166 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000167 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000169 index = (int) _getrecord(v)->bidirectional;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000171}
172
173static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000174unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175{
176 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000177
178 if (!PyArg_ParseTuple(args, "O!:combining",
179 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181 if (PyUnicode_GET_SIZE(v) != 1) {
182 PyErr_SetString(PyExc_TypeError,
183 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000186 return PyInt_FromLong((int) _getrecord(v)->combining);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000187}
188
189static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000190unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191{
192 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193
194 if (!PyArg_ParseTuple(args, "O!:mirrored",
195 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000196 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000197 if (PyUnicode_GET_SIZE(v) != 1) {
198 PyErr_SetString(PyExc_TypeError,
199 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000200 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000202 return PyInt_FromLong((int) _getrecord(v)->mirrored);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203}
204
205static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000206unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000207{
208 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000209 char decomp[256];
210 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000211
212 if (!PyArg_ParseTuple(args, "O!:decomposition",
213 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000214 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000215 if (PyUnicode_GET_SIZE(v) != 1) {
216 PyErr_SetString(PyExc_TypeError,
217 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000218 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000219 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000220
221 code = (int) *PyUnicode_AS_UNICODE(v);
222
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000223 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000224 index = 0;
225 else {
226 index = decomp_index1[(code>>DECOMP_SHIFT)];
227 index = decomp_index2[(index<<DECOMP_SHIFT)+
228 (code&((1<<DECOMP_SHIFT)-1))];
229 }
230
Tim Peters69b83b12001-11-30 07:23:05 +0000231 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000232 is prefix code (from*/
233 count = decomp_data[index] >> 8;
234
235 /* XXX: could allocate the PyString up front instead
236 (strlen(prefix) + 5 * count + 1 bytes) */
237
238 /* copy prefix */
239 i = strlen(decomp_prefix[decomp_data[index] & 255]);
240 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
241
242 while (count-- > 0) {
243 if (i)
244 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000245 assert((size_t)i < sizeof(decomp));
246 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
247 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000248 i += strlen(decomp + i);
249 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000251 decomp[i] = '\0';
252
253 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254}
255
Fredrik Lundh06d12682001-01-24 07:59:11 +0000256/* -------------------------------------------------------------------- */
257/* unicode character name tables */
258
259/* data file generated by Tools/unicode/makeunicodedata.py */
260#include "unicodename_db.h"
261
262/* -------------------------------------------------------------------- */
263/* database code (cut and pasted from the unidb package) */
264
/* Compute the case-insensitive hash used by the name -> code table.

   s      bytes of the name (only the first len bytes are read)
   len    number of bytes to hash
   scale  multiplier applied to the running hash before each byte

   Each byte is upper-cased before being mixed in.  Whenever bits 24..31
   of the running value become set, they are XORed into the low byte and
   the value is masked back down to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* <ctype.h> functions require an argument representable as
           unsigned char (or EOF); a negative plain char is undefined
           behaviour, so convert before calling toupper() */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
279
/* Constants for algorithmic Hangul syllable naming. */
#define SBase 0xAC00            /* first precomposed syllable */
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19               /* rows used in column 0 of the table */
#define VCount 21               /* rows used in column 1 of the table */
#define TCount 28               /* rows used in column 2 of the table */
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)

/* Name fragments, one column per syllable part: column 0 holds LCount
   entries, column 1 holds VCount entries and column 2 holds TCount
   entries.  Unused cells are null pointers. */
static char *hangul_syllables[][3] = {
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { 0,    "YI",  "S"  },
    /* 20 */ { 0,    "I",   "SS" },
    /* 21 */ { 0,    0,     "NG" },
    /* 22 */ { 0,    0,     "J"  },
    /* 23 */ { 0,    0,     "C"  },
    /* 24 */ { 0,    0,     "K"  },
    /* 25 */ { 0,    0,     "T"  },
    /* 26 */ { 0,    0,     "P"  },
    /* 27 */ { 0,    0,     "H"  }
};
320
Fredrik Lundh06d12682001-01-24 07:59:11 +0000321static int
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000322_getucname(Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000323{
324 int offset;
325 int i;
326 int word;
327 unsigned char* w;
328
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000329 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000330 /* Hangul syllable. */
331 int SIndex = code - SBase;
332 int L = SIndex / NCount;
333 int V = (SIndex % NCount) / TCount;
334 int T = SIndex % TCount;
335
336 if (buflen < 27)
337 /* Worst case: HANGUL SYLLABLE <10chars>. */
338 return 0;
339 strcpy(buffer, "HANGUL SYLLABLE ");
340 buffer += 16;
341 strcpy(buffer, hangul_syllables[L][0]);
342 buffer += strlen(hangul_syllables[L][0]);
343 strcpy(buffer, hangul_syllables[V][1]);
344 buffer += strlen(hangul_syllables[V][1]);
345 strcpy(buffer, hangul_syllables[T][2]);
346 buffer += strlen(hangul_syllables[T][2]);
347 *buffer = '\0';
348 return 1;
349 }
350
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000351 if ((0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
352 (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
353 (0x20000 <= code && code <= 0x2A6D6)) {/* CJK Ideograph Extension B */
354 if (buflen < 28)
355 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
356 return 0;
357 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
358 return 1;
359 }
360
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000361 if (code >= 0x110000)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000362 return 0;
363
364 /* get offset into phrasebook */
365 offset = phrasebook_offset1[(code>>phrasebook_shift)];
366 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
367 (code&((1<<phrasebook_shift)-1))];
368 if (!offset)
369 return 0;
370
371 i = 0;
372
373 for (;;) {
374 /* get word index */
375 word = phrasebook[offset] - phrasebook_short;
376 if (word >= 0) {
377 word = (word << 8) + phrasebook[offset+1];
378 offset += 2;
379 } else
380 word = phrasebook[offset++];
381 if (i) {
382 if (i > buflen)
383 return 0; /* buffer overflow */
384 buffer[i++] = ' ';
385 }
386 /* copy word string from lexicon. the last character in the
387 word has bit 7 set. the last word in a string ends with
388 0x80 */
389 w = lexicon + lexicon_offset[word];
390 while (*w < 128) {
391 if (i >= buflen)
392 return 0; /* buffer overflow */
393 buffer[i++] = *w++;
394 }
395 if (i >= buflen)
396 return 0; /* buffer overflow */
397 buffer[i++] = *w & 127;
398 if (*w == 128)
399 break; /* end of word */
400 }
401
402 return 1;
403}
404
405static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000406_cmpname(int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000407{
408 /* check if code corresponds to the given name */
409 int i;
410 char buffer[NAME_MAXLEN];
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000411 if (!_getucname(code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000412 return 0;
413 for (i = 0; i < namelen; i++) {
414 if (toupper(name[i]) != buffer[i])
415 return 0;
416 }
417 return buffer[namelen] == '\0';
418}
419
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000420static void
421find_syllable(const char *str, int *len, int *pos, int count, int column)
422{
423 int i, len1;
424 *len = -1;
425 for (i = 0; i < count; i++) {
426 char *s = hangul_syllables[i][column];
427 len1 = strlen(s);
428 if (len1 <= *len)
429 continue;
430 if (strncmp(str, s, len1) == 0) {
431 *len = len1;
432 *pos = i;
433 }
434 }
435 if (*len == -1) {
436 *len = 0;
437 *pos = -1;
438 }
439}
440
Fredrik Lundh06d12682001-01-24 07:59:11 +0000441static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000442_getcode(const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000443{
444 unsigned int h, v;
445 unsigned int mask = code_size-1;
446 unsigned int i, incr;
447
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000448 /* Check for hangul syllables. */
449 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
450 int L, V, T, len;
451 const char *pos = name + 16;
452 find_syllable(pos, &len, &L, LCount, 0);
453 pos += len;
454 find_syllable(pos, &len, &V, VCount, 1);
455 pos += len;
456 find_syllable(pos, &len, &T, TCount, 2);
457 pos += len;
458 if (V != -1 && V != -1 && T != -1 && pos-name == namelen) {
459 *code = SBase + (L*VCount+V)*TCount + T;
460 return 1;
461 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000462 /* Otherwise, it's an illegal syllable name. */
463 return 0;
464 }
465
466 /* Check for unified ideographs. */
467 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
468 /* Four or five hexdigits must follow. */
469 v = 0;
470 name += 22;
471 namelen -= 22;
472 if (namelen != 4 && namelen != 5)
473 return 0;
474 while (namelen--) {
475 v *= 16;
476 if (*name >= '0' && *name <= '9')
477 v += *name - '0';
478 else if (*name >= 'A' && *name <= 'F')
479 v += *name - 'A' + 10;
480 else
481 return 0;
482 name++;
483 }
484 *code = v;
485 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000486 }
487
Fredrik Lundh06d12682001-01-24 07:59:11 +0000488 /* the following is the same as python's dictionary lookup, with
489 only minor changes. see the makeunicodedata script for more
490 details */
491
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000492 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000493 i = (~h) & mask;
494 v = code_hash[i];
495 if (!v)
496 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000497 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000498 *code = v;
499 return 1;
500 }
501 incr = (h ^ (h >> 3)) & mask;
502 if (!incr)
503 incr = mask;
504 for (;;) {
505 i = (i + incr) & mask;
506 v = code_hash[i];
507 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +0000508 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000509 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000510 *code = v;
511 return 1;
512 }
513 incr = incr << 1;
514 if (incr > mask)
515 incr = incr ^ code_poly;
516 }
517}
518
519static const _PyUnicode_Name_CAPI hashAPI =
520{
521 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000522 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000523 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +0000524};
525
526/* -------------------------------------------------------------------- */
527/* Python bindings */
528
529static PyObject *
530unicodedata_name(PyObject* self, PyObject* args)
531{
532 char name[NAME_MAXLEN];
533
534 PyUnicodeObject* v;
535 PyObject* defobj = NULL;
536 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
537 return NULL;
538
539 if (PyUnicode_GET_SIZE(v) != 1) {
540 PyErr_SetString(PyExc_TypeError,
541 "need a single Unicode character as parameter");
542 return NULL;
543 }
544
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000545 if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000546 name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000547 if (defobj == NULL) {
548 PyErr_SetString(PyExc_ValueError, "no such name");
549 return NULL;
550 }
551 else {
552 Py_INCREF(defobj);
553 return defobj;
554 }
555 }
556
557 return Py_BuildValue("s", name);
558}
559
560static PyObject *
561unicodedata_lookup(PyObject* self, PyObject* args)
562{
563 Py_UCS4 code;
564 Py_UNICODE str[1];
565
566 char* name;
567 int namelen;
568 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
569 return NULL;
570
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000571 if (!_getcode(name, namelen, &code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000572 char fmt[] = "undefined character name '%s'";
573 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
574 sprintf(buf, fmt, name);
575 PyErr_SetString(PyExc_KeyError, buf);
576 PyMem_FREE(buf);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000577 return NULL;
578 }
579
580 str[0] = (Py_UNICODE) code;
581 return PyUnicode_FromUnicode(str, 1);
582}
583
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000584/* XXX Add doc strings. */
585
586static PyMethodDef unicodedata_functions[] = {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000587 {"decimal", unicodedata_decimal, METH_VARARGS},
588 {"digit", unicodedata_digit, METH_VARARGS},
589 {"numeric", unicodedata_numeric, METH_VARARGS},
590 {"category", unicodedata_category, METH_VARARGS},
591 {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
592 {"combining", unicodedata_combining, METH_VARARGS},
593 {"mirrored", unicodedata_mirrored, METH_VARARGS},
594 {"decomposition",unicodedata_decomposition, METH_VARARGS},
595 {"name", unicodedata_name, METH_VARARGS},
596 {"lookup", unicodedata_lookup, METH_VARARGS},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000597 {NULL, NULL} /* sentinel */
598};
599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +0000600PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
Fredrik Lundh06d12682001-01-24 07:59:11 +0000601
Mark Hammond62b1ab12002-07-23 06:31:15 +0000602PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +0000603initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000604{
Fred Drakea2bd8d32002-04-03 21:39:26 +0000605 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +0000606
Fred Drakef585bef2001-03-03 19:41:55 +0000607 m = Py_InitModule3(
608 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000609 if (!m)
610 return;
611
Fredrik Lundh06d12682001-01-24 07:59:11 +0000612 /* Export C API */
613 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +0000614 if (v != NULL)
615 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000616}
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000617
618/*
619Local variables:
620c-basic-offset: 4
621End:
622*/