blob: 311db296bdbcecacdef9c44e98be8ae79ef45fe1 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.2 database.

   Data was extracted from the Unicode 3.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
17
18/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000019
/* One row of the character property table.  All members are const. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
28
29/* data file generated by Tools/unicode/makeunicodedata.py */
30#include "unicodedata_db.h"
31
32static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000033_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000034{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000035 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000036 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000037 index = 0;
38 else {
39 index = index1[(code>>SHIFT)];
40 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
41 }
42
43 return &_PyUnicode_Database_Records[index];
44}
45
Martin v. Löwis677bde22002-11-23 22:08:15 +000046static const _PyUnicode_DatabaseRecord*
47_getrecord(PyUnicodeObject* v)
48{
49 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
50}
51
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000052/* --- Module API --------------------------------------------------------- */
53
54static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000055unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000056{
57 PyUnicodeObject *v;
58 PyObject *defobj = NULL;
59 long rc;
60
Fredrik Lundh06d12682001-01-24 07:59:11 +000061 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000062 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000063 if (PyUnicode_GET_SIZE(v) != 1) {
64 PyErr_SetString(PyExc_TypeError,
65 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000066 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000067 }
68 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
69 if (rc < 0) {
70 if (defobj == NULL) {
71 PyErr_SetString(PyExc_ValueError,
72 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000073 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000074 }
75 else {
76 Py_INCREF(defobj);
77 return defobj;
78 }
79 }
80 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000081}
82
83static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000084unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000085{
86 PyUnicodeObject *v;
87 PyObject *defobj = NULL;
88 long rc;
89
Fredrik Lundh06d12682001-01-24 07:59:11 +000090 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000091 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000092 if (PyUnicode_GET_SIZE(v) != 1) {
93 PyErr_SetString(PyExc_TypeError,
94 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000095 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000096 }
97 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
98 if (rc < 0) {
99 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000100 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000101 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000102 }
103 else {
104 Py_INCREF(defobj);
105 return defobj;
106 }
107 }
108 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000109}
110
111static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000112unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000113{
114 PyUnicodeObject *v;
115 PyObject *defobj = NULL;
116 double rc;
117
Fredrik Lundh06d12682001-01-24 07:59:11 +0000118 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000119 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000120 if (PyUnicode_GET_SIZE(v) != 1) {
121 PyErr_SetString(PyExc_TypeError,
122 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000123 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000124 }
125 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
126 if (rc < 0) {
127 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000128 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000130 }
131 else {
132 Py_INCREF(defobj);
133 return defobj;
134 }
135 }
136 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000137}
138
139static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000140unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000141{
142 PyUnicodeObject *v;
143 int index;
144
145 if (!PyArg_ParseTuple(args, "O!:category",
146 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000147 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000148 if (PyUnicode_GET_SIZE(v) != 1) {
149 PyErr_SetString(PyExc_TypeError,
150 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000152 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000153 index = (int) _getrecord(v)->category;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000154 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000155}
156
157static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000158unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159{
160 PyUnicodeObject *v;
161 int index;
162
163 if (!PyArg_ParseTuple(args, "O!:bidirectional",
164 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000165 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000166 if (PyUnicode_GET_SIZE(v) != 1) {
167 PyErr_SetString(PyExc_TypeError,
168 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000171 index = (int) _getrecord(v)->bidirectional;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000172 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000173}
174
175static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000177{
178 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000179
180 if (!PyArg_ParseTuple(args, "O!:combining",
181 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000182 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000183 if (PyUnicode_GET_SIZE(v) != 1) {
184 PyErr_SetString(PyExc_TypeError,
185 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000186 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000187 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000188 return PyInt_FromLong((int) _getrecord(v)->combining);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189}
190
191static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000192unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193{
194 PyUnicodeObject *v;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000195
196 if (!PyArg_ParseTuple(args, "O!:mirrored",
197 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000198 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000199 if (PyUnicode_GET_SIZE(v) != 1) {
200 PyErr_SetString(PyExc_TypeError,
201 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000202 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000204 return PyInt_FromLong((int) _getrecord(v)->mirrored);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205}
206
207static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209{
210 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 char decomp[256];
212 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000213
214 if (!PyArg_ParseTuple(args, "O!:decomposition",
215 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000216 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000217 if (PyUnicode_GET_SIZE(v) != 1) {
218 PyErr_SetString(PyExc_TypeError,
219 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000220 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000222
223 code = (int) *PyUnicode_AS_UNICODE(v);
224
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000225 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000226 index = 0;
227 else {
228 index = decomp_index1[(code>>DECOMP_SHIFT)];
229 index = decomp_index2[(index<<DECOMP_SHIFT)+
230 (code&((1<<DECOMP_SHIFT)-1))];
231 }
232
Tim Peters69b83b12001-11-30 07:23:05 +0000233 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000234 is prefix code (from*/
235 count = decomp_data[index] >> 8;
236
237 /* XXX: could allocate the PyString up front instead
238 (strlen(prefix) + 5 * count + 1 bytes) */
239
240 /* copy prefix */
241 i = strlen(decomp_prefix[decomp_data[index] & 255]);
242 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
243
244 while (count-- > 0) {
245 if (i)
246 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000247 assert((size_t)i < sizeof(decomp));
248 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
249 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000250 i += strlen(decomp + i);
251 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000253 decomp[i] = '\0';
254
255 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256}
257
Martin v. Löwis677bde22002-11-23 22:08:15 +0000258void
259get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
260{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000261 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000262 *index = 0;
263 }
264 else {
265 *index = decomp_index1[(code>>DECOMP_SHIFT)];
266 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
267 (code&((1<<DECOMP_SHIFT)-1))];
268 }
269
270 /* high byte is number of hex bytes (usually one or two), low byte
271 is prefix code (from*/
272 *count = decomp_data[*index] >> 8;
273 *prefix = decomp_data[*index] & 255;
274
275 (*index)++;
276}
277
/* Constants for algorithmic Hangul syllable (de)composition and naming. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* base of the leading (L) part */
#define VBase   0x1161              /* base of the vowel (V) part */
#define TBase   0x11A7              /* base of the trailing (T) part;
                                       index 0 means "no trailing part" */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading part */
#define SCount  (LCount*NCount)     /* total precomposed syllables */
287
288static PyObject*
289nfd_nfkd(PyObject *input, int k)
290{
291 PyObject *result;
292 Py_UNICODE *i, *end, *o;
293 /* Longest decomposition in Unicode 3.2: U+FDFA */
294 Py_UNICODE stack[20];
295 int space, stackptr, isize;
296 int index, prefix, count;
297 unsigned char prev, cur;
298
299 stackptr = 0;
300 isize = PyUnicode_GET_SIZE(input);
301 /* Overallocate atmost 10 characters. */
302 space = (isize > 10 ? 10 : isize) + isize;
303 result = PyUnicode_FromUnicode(NULL, space);
304 if (!result)
305 return NULL;
306 i = PyUnicode_AS_UNICODE(input);
307 end = i + isize;
308 o = PyUnicode_AS_UNICODE(result);
309
310 while (i < end) {
311 stack[stackptr++] = *i++;
312 while(stackptr) {
313 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000314 /* Hangul Decomposition adds three characters in
315 a single step, so we need atleast that much room. */
316 if (space < 3) {
317 int newsize = PyString_GET_SIZE(result) + 10;
318 space += 10;
319 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000320 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000321 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000322 }
323 /* Hangul Decomposition. */
324 if (SBase <= code && code < (SBase+SCount)) {
325 int SIndex = code - SBase;
326 int L = LBase + SIndex / NCount;
327 int V = VBase + (SIndex % NCount) / TCount;
328 int T = TBase + SIndex % TCount;
329 *o++ = L;
330 *o++ = V;
331 space -= 2;
332 if (T != TBase) {
333 *o++ = T;
334 space --;
335 }
336 continue;
337 }
338 /* Other decompoistions. */
339 get_decomp_record(code, &index, &prefix, &count);
340
341 /* Copy character if it is not decomposable, or has a
342 compatibility decomposition, but we do NFD. */
343 if (!count || (prefix && !k)) {
344 *o++ = code;
345 space--;
346 continue;
347 }
348 /* Copy decomposition onto the stack, in reverse
349 order. */
350 while(count) {
351 code = decomp_data[index + (--count)];
352 stack[stackptr++] = code;
353 }
354 }
355 }
356
357 /* Drop overallocation. Cannot fail. */
358 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
359
360 /* Sort canonically. */
361 i = PyUnicode_AS_UNICODE(result);
362 prev = _getrecord_ex(*i)->combining;
363 end = i + PyUnicode_GET_SIZE(result);
364 for (i++; i < end; i++) {
365 cur = _getrecord_ex(*i)->combining;
366 if (prev == 0 || cur == 0 || prev <= cur) {
367 prev = cur;
368 continue;
369 }
370 /* Non-canonical order. Need to switch *i with previous. */
371 o = i - 1;
372 while (1) {
373 Py_UNICODE tmp = o[1];
374 o[1] = o[0];
375 o[0] = tmp;
376 o--;
377 if (o < PyUnicode_AS_UNICODE(result))
378 break;
379 prev = _getrecord_ex(*o)->combining;
380 if (prev == 0 || prev <= cur)
381 break;
382 }
383 prev = _getrecord_ex(*i)->combining;
384 }
385 return result;
386}
387
388static int
389find_nfc_index(struct reindex* nfc, Py_UNICODE code)
390{
391 int index;
392 for (index = 0; nfc[index].start; index++) {
393 int start = nfc[index].start;
394 if (code < start)
395 return -1;
396 if (code <= start + nfc[index].count) {
397 int delta = code - start;
398 return nfc[index].index + delta;
399 }
400 }
401 return -1;
402}
403
404static PyObject*
405nfc_nfkc(PyObject *input, int k)
406{
407 PyObject *result;
408 Py_UNICODE *i, *i1, *o, *end;
409 int f,l,index,index1,comb;
410 Py_UNICODE code;
411 Py_UNICODE *skipped[20];
412 int cskipped = 0;
413
414 result = nfd_nfkd(input, k);
415 if (!result)
416 return NULL;
417
418 /* We are going to modify result in-place.
419 If nfd_nfkd is changed to sometimes return the input,
420 this code needs to be reviewed. */
421 assert(result != input);
422
423 i = PyUnicode_AS_UNICODE(result);
424 end = i + PyUnicode_GET_SIZE(result);
425 o = PyUnicode_AS_UNICODE(result);
426
427 again:
428 while (i < end) {
429 for (index = 0; index < cskipped; index++) {
430 if (skipped[index] == i) {
431 /* *i character is skipped.
432 Remove from list. */
433 skipped[index] = skipped[cskipped-1];
434 cskipped--;
435 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000436 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000437 }
438 }
439 /* Hangul Composition. We don't need to check for <LV,T>
440 pairs, since we always have decomposed data. */
441 if (LBase <= *i && *i < (LBase+LCount) &&
442 i + 1 < end &&
443 VBase <= i[1] && i[1] <= (VBase+VCount)) {
444 int LIndex, VIndex;
445 LIndex = i[0] - LBase;
446 VIndex = i[1] - VBase;
447 code = SBase + (LIndex*VCount+VIndex)*TCount;
448 i+=2;
449 if (i < end &&
450 TBase <= *i && *i <= (TBase+TCount)) {
451 code += *i-TBase;
452 i++;
453 }
454 *o++ = code;
455 continue;
456 }
457
458 f = find_nfc_index(nfc_first, *i);
459 if (f == -1) {
460 *o++ = *i++;
461 continue;
462 }
463 /* Find next unblocked character. */
464 i1 = i+1;
465 comb = 0;
466 while (i1 < end) {
467 int comb1 = _getrecord_ex(*i1)->combining;
468 if (comb1 && comb == comb1) {
469 /* Character is blocked. */
470 i1++;
471 continue;
472 }
473 l = find_nfc_index(nfc_last, *i1);
474 /* *i1 cannot be combined with *i. If *i1
475 is a starter, we don't need to look further.
476 Otherwise, record the combining class. */
477 if (l == -1) {
478 not_combinable:
479 if (comb1 == 0)
480 break;
481 comb = comb1;
482 i1++;
483 continue;
484 }
485 index = f*TOTAL_LAST + l;
486 index1 = comp_index[index >> COMP_SHIFT];
487 code = comp_data[(index1<<COMP_SHIFT)+
488 (index&((1<<COMP_SHIFT)-1))];
489 if (code == 0)
490 goto not_combinable;
491
492 /* Replace the original character. */
493 *i = code;
494 /* Mark the second character unused. */
495 skipped[cskipped++] = i1;
496 i1++;
497 f = find_nfc_index(nfc_first, *i);
498 if (f == -1)
499 break;
500 }
501 *o++ = *i++;
502 }
503 if (o != end)
504 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
505 return result;
506}
507
508static PyObject*
509unicodedata_normalize(PyObject *self, PyObject *args)
510{
511 char *form;
512 PyObject *input;
513
514 if(!PyArg_ParseTuple(args, "sO!:normalized",
515 &form, &PyUnicode_Type, &input))
516 return NULL;
517
518 if (strcmp(form, "NFC") == 0)
519 return nfc_nfkc(input, 0);
520 if (strcmp(form, "NFKC") == 0)
521 return nfc_nfkc(input, 1);
522 if (strcmp(form, "NFD") == 0)
523 return nfd_nfkd(input, 0);
524 if (strcmp(form, "NFKD") == 0)
525 return nfd_nfkd(input, 1);
526 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
527 return NULL;
528}
529
Fredrik Lundh06d12682001-01-24 07:59:11 +0000530/* -------------------------------------------------------------------- */
531/* unicode character name tables */
532
533/* data file generated by Tools/unicode/makeunicodedata.py */
534#include "unicodename_db.h"
535
536/* -------------------------------------------------------------------- */
537/* database code (cut and pasted from the unidb package) */
538
/* Compute the case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and folded in as h = h * scale + byte.
   Whenever bits 24-31 of h become non-zero they are XORed into the low
   byte and h is cut back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() is only defined for EOF and values representable as
           unsigned char, so convert the (possibly negative) char first. */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
553
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by the leading part, column 1 by the vowel part
   and column 2 by the trailing part; the columns have different lengths,
   so unused cells are null pointers. */
static char *hangul_syllables[][3] = {
    /*  L      V      T   */
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
584
Fredrik Lundh06d12682001-01-24 07:59:11 +0000585static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000586is_unified_ideograph(Py_UCS4 code)
587{
588 return (
589 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
590 (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
591 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
592}
593
594static int
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000595_getucname(Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000596{
597 int offset;
598 int i;
599 int word;
600 unsigned char* w;
601
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000602 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000603 /* Hangul syllable. */
604 int SIndex = code - SBase;
605 int L = SIndex / NCount;
606 int V = (SIndex % NCount) / TCount;
607 int T = SIndex % TCount;
608
609 if (buflen < 27)
610 /* Worst case: HANGUL SYLLABLE <10chars>. */
611 return 0;
612 strcpy(buffer, "HANGUL SYLLABLE ");
613 buffer += 16;
614 strcpy(buffer, hangul_syllables[L][0]);
615 buffer += strlen(hangul_syllables[L][0]);
616 strcpy(buffer, hangul_syllables[V][1]);
617 buffer += strlen(hangul_syllables[V][1]);
618 strcpy(buffer, hangul_syllables[T][2]);
619 buffer += strlen(hangul_syllables[T][2]);
620 *buffer = '\0';
621 return 1;
622 }
623
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000624 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000625 if (buflen < 28)
626 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
627 return 0;
628 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
629 return 1;
630 }
631
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000632 if (code >= 0x110000)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000633 return 0;
634
635 /* get offset into phrasebook */
636 offset = phrasebook_offset1[(code>>phrasebook_shift)];
637 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
638 (code&((1<<phrasebook_shift)-1))];
639 if (!offset)
640 return 0;
641
642 i = 0;
643
644 for (;;) {
645 /* get word index */
646 word = phrasebook[offset] - phrasebook_short;
647 if (word >= 0) {
648 word = (word << 8) + phrasebook[offset+1];
649 offset += 2;
650 } else
651 word = phrasebook[offset++];
652 if (i) {
653 if (i > buflen)
654 return 0; /* buffer overflow */
655 buffer[i++] = ' ';
656 }
657 /* copy word string from lexicon. the last character in the
658 word has bit 7 set. the last word in a string ends with
659 0x80 */
660 w = lexicon + lexicon_offset[word];
661 while (*w < 128) {
662 if (i >= buflen)
663 return 0; /* buffer overflow */
664 buffer[i++] = *w++;
665 }
666 if (i >= buflen)
667 return 0; /* buffer overflow */
668 buffer[i++] = *w & 127;
669 if (*w == 128)
670 break; /* end of word */
671 }
672
673 return 1;
674}
675
676static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000677_cmpname(int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000678{
679 /* check if code corresponds to the given name */
680 int i;
681 char buffer[NAME_MAXLEN];
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000682 if (!_getucname(code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000683 return 0;
684 for (i = 0; i < namelen; i++) {
685 if (toupper(name[i]) != buffer[i])
686 return 0;
687 }
688 return buffer[namelen] == '\0';
689}
690
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000691static void
692find_syllable(const char *str, int *len, int *pos, int count, int column)
693{
694 int i, len1;
695 *len = -1;
696 for (i = 0; i < count; i++) {
697 char *s = hangul_syllables[i][column];
698 len1 = strlen(s);
699 if (len1 <= *len)
700 continue;
701 if (strncmp(str, s, len1) == 0) {
702 *len = len1;
703 *pos = i;
704 }
705 }
706 if (*len == -1) {
707 *len = 0;
708 *pos = -1;
709 }
710}
711
Fredrik Lundh06d12682001-01-24 07:59:11 +0000712static int
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000713_getcode(const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000714{
715 unsigned int h, v;
716 unsigned int mask = code_size-1;
717 unsigned int i, incr;
718
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000719 /* Check for hangul syllables. */
720 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
721 int L, V, T, len;
722 const char *pos = name + 16;
723 find_syllable(pos, &len, &L, LCount, 0);
724 pos += len;
725 find_syllable(pos, &len, &V, VCount, 1);
726 pos += len;
727 find_syllable(pos, &len, &T, TCount, 2);
728 pos += len;
729 if (V != -1 && V != -1 && T != -1 && pos-name == namelen) {
730 *code = SBase + (L*VCount+V)*TCount + T;
731 return 1;
732 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000733 /* Otherwise, it's an illegal syllable name. */
734 return 0;
735 }
736
737 /* Check for unified ideographs. */
738 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
739 /* Four or five hexdigits must follow. */
740 v = 0;
741 name += 22;
742 namelen -= 22;
743 if (namelen != 4 && namelen != 5)
744 return 0;
745 while (namelen--) {
746 v *= 16;
747 if (*name >= '0' && *name <= '9')
748 v += *name - '0';
749 else if (*name >= 'A' && *name <= 'F')
750 v += *name - 'A' + 10;
751 else
752 return 0;
753 name++;
754 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000755 if (!is_unified_ideograph(v))
756 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000757 *code = v;
758 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000759 }
760
Fredrik Lundh06d12682001-01-24 07:59:11 +0000761 /* the following is the same as python's dictionary lookup, with
762 only minor changes. see the makeunicodedata script for more
763 details */
764
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000765 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000766 i = (~h) & mask;
767 v = code_hash[i];
768 if (!v)
769 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000770 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000771 *code = v;
772 return 1;
773 }
774 incr = (h ^ (h >> 3)) & mask;
775 if (!incr)
776 incr = mask;
777 for (;;) {
778 i = (i + incr) & mask;
779 v = code_hash[i];
780 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +0000781 return 0;
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000782 if (_cmpname(v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000783 *code = v;
784 return 1;
785 }
786 incr = incr << 1;
787 if (incr > mask)
788 incr = incr ^ code_poly;
789 }
790}
791
792static const _PyUnicode_Name_CAPI hashAPI =
793{
794 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000795 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000796 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +0000797};
798
799/* -------------------------------------------------------------------- */
800/* Python bindings */
801
802static PyObject *
803unicodedata_name(PyObject* self, PyObject* args)
804{
805 char name[NAME_MAXLEN];
806
807 PyUnicodeObject* v;
808 PyObject* defobj = NULL;
809 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
810 return NULL;
811
812 if (PyUnicode_GET_SIZE(v) != 1) {
813 PyErr_SetString(PyExc_TypeError,
814 "need a single Unicode character as parameter");
815 return NULL;
816 }
817
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +0000818 if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000819 name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000820 if (defobj == NULL) {
821 PyErr_SetString(PyExc_ValueError, "no such name");
822 return NULL;
823 }
824 else {
825 Py_INCREF(defobj);
826 return defobj;
827 }
828 }
829
830 return Py_BuildValue("s", name);
831}
832
833static PyObject *
834unicodedata_lookup(PyObject* self, PyObject* args)
835{
836 Py_UCS4 code;
837 Py_UNICODE str[1];
838
839 char* name;
840 int namelen;
841 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
842 return NULL;
843
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000844 if (!_getcode(name, namelen, &code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000845 char fmt[] = "undefined character name '%s'";
846 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
847 sprintf(buf, fmt, name);
848 PyErr_SetString(PyExc_KeyError, buf);
849 PyMem_FREE(buf);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000850 return NULL;
851 }
852
853 str[0] = (Py_UNICODE) code;
854 return PyUnicode_FromUnicode(str, 1);
855}
856
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000857/* XXX Add doc strings. */
858
859static PyMethodDef unicodedata_functions[] = {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000860 {"decimal", unicodedata_decimal, METH_VARARGS},
861 {"digit", unicodedata_digit, METH_VARARGS},
862 {"numeric", unicodedata_numeric, METH_VARARGS},
863 {"category", unicodedata_category, METH_VARARGS},
864 {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
865 {"combining", unicodedata_combining, METH_VARARGS},
866 {"mirrored", unicodedata_mirrored, METH_VARARGS},
867 {"decomposition",unicodedata_decomposition, METH_VARARGS},
868 {"name", unicodedata_name, METH_VARARGS},
869 {"lookup", unicodedata_lookup, METH_VARARGS},
Martin v. Löwis677bde22002-11-23 22:08:15 +0000870 {"normalize", unicodedata_normalize, METH_VARARGS},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000871 {NULL, NULL} /* sentinel */
872};
873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +0000874PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
Fredrik Lundh06d12682001-01-24 07:59:11 +0000875
Mark Hammond62b1ab12002-07-23 06:31:15 +0000876PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +0000877initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000878{
Fred Drakea2bd8d32002-04-03 21:39:26 +0000879 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +0000880
Fred Drakef585bef2001-03-03 19:41:55 +0000881 m = Py_InitModule3(
882 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000883 if (!m)
884 return;
885
Martin v. Löwisb5c980b2002-11-25 09:13:37 +0000886 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
887
Fredrik Lundh06d12682001-01-24 07:59:11 +0000888 /* Export C API */
889 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +0000890 if (v != NULL)
891 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000892}
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000893
894/*
895Local variables:
896c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +0000897indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000898End:
899*/